In [13]:
import time
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import numpy as np
import evaluate
import torch
from torch.utils.data import DataLoader
from peft import LoraConfig, TaskType, get_peft_model
from sklearn.metrics import accuracy_score
from torch.nn import CrossEntropyLoss
import copy
import os

In [2]:
# Mini-batch size shared by every DataLoader in this notebook.
BATCH_SIZE = 16

# GPU this study was run on. torch.device("cuda:2") can be constructed on any
# CUDA host, but the first `.to(DEVICE)` fails with an invalid-ordinal error
# when fewer than three GPUs are present, so fall back to GPU 0 in that case.
PREFERRED_GPU_INDEX = 2

if torch.cuda.is_available():
    gpu_index = PREFERRED_GPU_INDEX if torch.cuda.device_count() > PREFERRED_GPU_INDEX else 0
    DEVICE = torch.device(f"cuda:{gpu_index}")
else:
    DEVICE = torch.device("cpu")
DEVICE

device(type='cuda', index=2)

In [3]:
def set_seed(seed):
    """Seed Python's `random`, NumPy and PyTorch (CPU and CUDA) with `seed`.

    Also switches cuDNN to deterministic mode and disables its benchmark
    autotuner. Prints the seed that was applied.
    """
    import random
    import numpy as np
    import torch

    print(f"Setting seed: {seed}")

    # Python / NumPy generators
    random.seed(seed)
    np.random.seed(seed)

    # PyTorch generators (CPU, current CUDA device, all CUDA devices)
    for seeder in (torch.manual_seed, torch.cuda.manual_seed, torch.cuda.manual_seed_all):
        seeder(seed)

    # Deterministic cuDNN kernels, no autotuning
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

seed = 2025

## model & data preparation 

In [4]:
# Base checkpoint with a freshly initialised 2-way classification head.
checkpoint = "google-bert/bert-large-cased"
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=2,
)

model

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-large-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 1024, padding_idx=0)
      (position_embeddings): Embedding(512, 1024)
      (token_type_embeddings): Embedding(2, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-23): 24 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
              (LayerNorm): LayerNorm((1

In [5]:
# ---- Load GLUE / MRPC and tokenize sentence pairs ----
raw_dataset = load_dataset("glue", "mrpc")

tokenizer = AutoTokenizer.from_pretrained(checkpoint)


def tokenize_function(examples):
    """Tokenize a batch of sentence pairs, padded/truncated to 128 tokens."""
    return tokenizer(
        examples["sentence1"],
        examples["sentence2"],
        padding="max_length",
        truncation=True,
        max_length=128,
    )


# Drop the raw text columns, expose the target as "labels", return torch tensors.
tokenized_datasets = (
    raw_dataset.map(tokenize_function, batched=True)
    .remove_columns(["idx", "sentence1", "sentence2"])
    .rename_column("label", "labels")
    .with_format("torch")
)

# Fixed-size subsets: the first 1280 training rows and the first 320 test rows.
num_training_data = 1280
train_subset = tokenized_datasets["train"].select(range(num_training_data))
test_subset = tokenized_datasets["test"].select(range(num_training_data // 4))

train_loader = DataLoader(dataset=train_subset, batch_size=BATCH_SIZE, shuffle=True)

# Create DataLoader for the test dataset
test_loader = DataLoader(dataset=test_subset, batch_size=BATCH_SIZE, shuffle=True)

Map: 100%|████████████████████████| 3668/3668 [00:00<00:00, 13845.25 examples/s]
Map: 100%|██████████████████████████| 408/408 [00:00<00:00, 10894.43 examples/s]
Map: 100%|████████████████████████| 1725/1725 [00:00<00:00, 14335.31 examples/s]


## Self-defined trainer

In [6]:
# Function to calculate accuracy
def compute_accuracy(model, data_loader, device):
    """Return the classification accuracy of `model` over `data_loader`.

    The model is put in eval mode and run without gradients. Each batch is a
    dict of tensors that is moved to `device` and passed as keyword arguments;
    the prediction is the argmax of `outputs.logits` over the last dimension
    and is compared with the batch's "labels" entry.
    """
    model.eval()
    gold, predicted = [], []

    with torch.no_grad():
        for raw_batch in data_loader:
            inputs = {key: tensor.to(device) for key, tensor in raw_batch.items()}
            logits = model(**inputs).logits

            gold.extend(inputs["labels"].cpu().numpy())
            predicted.extend(logits.argmax(dim=-1).cpu().numpy())

    return accuracy_score(gold, predicted)
    
def compute_loss(model, data_loader, device):
    """Return the mean of the per-batch losses of `model` over `data_loader`.

    The model is put in eval mode and run without gradients. Each batch is a
    dict of tensors that is moved to `device` and passed as keyword arguments;
    the loss is read from `outputs.loss`. The average is taken over batches
    (not over samples).
    """
    model.eval()
    running = 0
    with torch.no_grad():
        for raw_batch in data_loader:
            inputs = {key: tensor.to(device) for key, tensor in raw_batch.items()}
            running += model(**inputs).loss.item()
    return running / len(data_loader)

In [7]:
# Default hyper-parameters; `train` itself only reads "max_time".
args = {
    "gamma": 15,
    "lamba": 1e-3,
    "inner_lr": 5e-4,
    "learning_rate": 5e-4,
    "tokenized_train_dataset": tokenized_datasets["train"],
    "tokenized_test_dataset": tokenized_datasets["test"],
    "prop_train": 0.66,
    "max_time": 1800,
    "num_inner_step": 1,
}

def train(num_epochs, model, optimizer, train_loader, test_loader, args = args, device=DEVICE ):
    """Single-level fine-tuning loop with per-epoch bookkeeping.

    Parameters
    ----------
    num_epochs : upper bound on the number of passes over `train_loader`.
    model : called as `model(**batch)`; the output must expose `.loss`
        (and `.logits`, used by `compute_accuracy`).
    optimizer : stepped once per training batch.
    train_loader, test_loader : iterables of dict batches with tensor values.
    args : only `args["max_time"]` is read - a budget, in seconds, on the
        accumulated training time (evaluation time is not counted).
    device : every batch is moved here before the forward pass.

    Returns
    -------
    ((tr_acc, tr_loss), (test_acc, test_loss), time_stamp) - each list holds
    the value before training followed by one value per completed epoch.
    """
    time_budget = args["max_time"]

    tr_acc, tr_loss, test_acc, test_loss = [], [], [], []

    def _record_metrics():
        # Evaluation order is kept fixed: train acc, train loss, test acc, test loss.
        tr_acc.append(compute_accuracy(model, train_loader, device))
        tr_loss.append(compute_loss(model, train_loader, device))
        test_acc.append(compute_accuracy(model, test_loader, device))
        test_loss.append(compute_loss(model, test_loader, device))

    # Metrics of the untouched model
    _record_metrics()
    time_stamp = [0]

    for epoch in range(num_epochs):
        epoch_begin = time.time()
        if time_stamp[-1] >= time_budget:
            break

        model.train()
        print(f"Epoch {epoch+1}/{num_epochs}")
        for raw_batch in train_loader:
            inputs = {key: tensor.to(device) for key, tensor in raw_batch.items()}

            # The model computes the loss itself from the "labels" entry.
            loss = model(**inputs).loss

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        # Only the optimisation pass above counts towards the time budget.
        time_stamp.append(time_stamp[-1] + (time.time() - epoch_begin))
        _record_metrics()

        print('Time elapsed: %.2f min' % (time_stamp[-1]/60))
        print(f"Epoch {epoch+1} Training Loss: {tr_loss[-1]}")
        print(f"Epoch {epoch+1} Training Accuracy: {tr_acc[-1]}")

    return (tr_acc, tr_loss), (test_acc, test_loss), time_stamp

## Train BiDoRa PBGD

In [8]:
def compute_regularization(B, A, device = DEVICE):
    """Squared Frobenius norm of `(B @ A)^T (B @ A) - I`.

    `B` and `A` must be matrices for which `B @ A` is defined; the identity is
    created on `device` with side length equal to the number of columns of
    `B @ A`.
    """
    V = torch.matmul(B, A)
    # Placeholder for Delta V: currently all zeros, so the shifted matrix equals V.
    shifted = V + torch.zeros_like(V)
    identity = torch.eye(V.shape[1], device=device)
    gram_residual = shifted.T @ shifted - identity
    return torch.norm(gram_residual, p='fro')**2

def compute_val_loss(model,val_loader, device=DEVICE):
    """Return the (differentiable) loss of `model` on one batch of `val_loader`.

    The model is switched to train mode, a fresh iterator is opened on
    `val_loader`, and only its first batch is used. The returned loss is the
    tensor from `outputs.loss`, not a Python float.
    """
    model.train()
    first_batch = next(iter(val_loader))
    inputs = {key: tensor.to(device) for key, tensor in first_batch.items()}
    return model(**inputs).loss

def freeze_layers(model, freeze_lora_magnitude=True, freeze_lora_embedding=True):
    """Toggle `requires_grad` on the DoRA magnitude vectors and low-rank factors.

    Parameters
    ----------
    model : any module exposing `named_parameters()`.
    freeze_lora_magnitude : if True, parameters whose name contains
        "lora_magnitude_vector" are frozen; if False they are made trainable.
    freeze_lora_embedding : if True, the low-rank factors are frozen; if False
        they are made trainable.

    The low-rank factors are matched by name. PEFT stores the factors of
    Linear layers under "lora_A"/"lora_B" and those of Embedding layers under
    "lora_embedding_A"/"lora_embedding_B"; both spellings are handled. The
    previous version only looked for the embedding spelling, so the factors of
    Linear layers were never frozen.

    Each flag controls its own group independently. For the two combinations
    used in this notebook, (True, False) and (False, True), the result on the
    magnitude vectors is unchanged. All other parameters are left untouched.
    """
    factor_tags = ("lora_A", "lora_B", "lora_embedding_A", "lora_embedding_B")
    magnitude_trainable = not freeze_lora_magnitude
    factors_trainable = not freeze_lora_embedding

    for name, param in model.named_parameters():
        if 'lora_magnitude_vector' in name:
            param.requires_grad = magnitude_trainable
        elif any(tag in name for tag in factor_tags):
            param.requires_grad = factors_trainable

In [9]:
args = {"gamma":15,
       "lamba": 1e-3,
       "inner_lr":5e-4,
       "learning_rate":5e-4,
       "tokenized_train_dataset": tokenized_datasets["train"],
       "tokenized_test_dataset": tokenized_datasets["test"],
       "prop_train": 0.66,
       "max_time": 1800,
       "num_inner_step":1
       }

def train_bidora_penalty(num_epochs, model, args = args, device=DEVICE, penalty_term=True):
    """Alternating two-phase DoRA training with an optional penalty term.

    The training split in `args["tokenized_train_dataset"]` is cut in two: the
    first `prop_train` fraction feeds the lower-level ("A/B") updates, the rest
    feeds the upper-level ("m") updates. For every upper-level batch:

    * Phase 1 - `freeze_layers(..., True, False)` is applied and
      `num_inner_step` Adam steps (lr `inner_lr`) are taken on
      `loss_val / gamma + loss_train + lamba * loss_reg`.
      With `penalty_term=True` a deep copy of the model (`model_LL`) takes the
      same number of steps on the plain training loss only.
    * Phase 2 - `freeze_layers(..., False, True)` is applied and one Adam step
      is taken on the upper-level batch loss, plus
      `gamma * (loss_train(model) - loss_train(model_LL))` when
      `penalty_term=True`.

    Parameters
    ----------
    num_epochs : upper bound on the number of passes over the upper-level loader.
    model : PEFT/DoRA-wrapped classifier, already on `device`.
    args : dict with keys gamma, lamba, inner_lr, learning_rate,
        tokenized_train_dataset, tokenized_test_dataset, prop_train,
        max_time (seconds of accumulated training time), num_inner_step.
    device : device batches are moved to.
    penalty_term : enable the `model_LL` copy and the penalty in phase 2.

    Returns
    -------
    ((tr_acc, tr_loss), (test_acc, test_loss), time_stamp) - each list holds
    the value before training followed by one value per completed epoch.

    Changes with respect to the previous version (numerical results unchanged):
    the per-step `total_loss += loss_m` accumulator was removed (it was never
    read and kept every step's autograd graph referenced), the quadratic scan
    over parameter names for the regulariser is done once instead of at every
    inner step, and the phase-2 optimiser is built once per step.
    """
    gamma = args["gamma"]
    lamba = args["lamba"]
    inner_lr = args["inner_lr"]
    learning_rate = args["learning_rate"]
    tokenized_train_dataset = args["tokenized_train_dataset"]
    tokenized_test_dataset = args["tokenized_test_dataset"]
    prop_train = args["prop_train"]
    max_time = args["max_time"]
    num_inner_step = args["num_inner_step"]

    # Contiguous split: first part for the A/B updates, remainder for the m updates.
    train_size = int(prop_train * len(tokenized_train_dataset))
    val_size = len(tokenized_train_dataset) - train_size
    ab_train_dataset = tokenized_train_dataset.select(range(train_size))
    m_train_dataset = tokenized_train_dataset.select(range(train_size, train_size + val_size))

    train_loader = DataLoader(dataset=ab_train_dataset,
                              batch_size=BATCH_SIZE,
                              shuffle=True)

    val_loader = DataLoader(dataset=m_train_dataset,
                            batch_size=BATCH_SIZE,
                            shuffle=False)

    test_loader = DataLoader(dataset=tokenized_test_dataset,
                             batch_size=BATCH_SIZE,
                             shuffle=False)

    # Metrics of the untouched model
    tr_acc = [compute_accuracy(model, train_loader, device)]
    tr_loss = [compute_loss(model, train_loader, device)]

    test_acc = [compute_accuracy(model, test_loader, device)]
    test_loss = [compute_loss(model, test_loader, device)]

    time_stamp = [0]

    # Parameter pairs fed to the regulariser. The pairing depends only on
    # parameter names, so it is resolved once here rather than at every step.
    # NOTE(review): the filter looks for "lora_embedding_A"/"lora_embedding_B"
    # and compares character 7 of the two names ("7 is the layer number" in
    # the original comment). Confirm this matches the intended parameters of
    # the wrapped model - if no name matches, loss_reg stays 0.
    named_params = list(model.named_parameters())
    reg_pairs = [
        (param, param2)
        for name, param in named_params
        for name2, param2 in named_params
        if "lora_embedding_A" in name and "lora_embedding_B" in name2 and name[7] == name2[7]
    ]

    for epoch in range(num_epochs):
        start_time = time.time()
        print(f"Epoch {epoch+1}/{num_epochs}")

        if time_stamp[-1] >= max_time:
            break

        for batch_val in val_loader:
            batch_val = {k: v.to(device) for k, v in batch_val.items()}

            # ---- Phase 1: update A and B, freeze m ----
            freeze_layers(model, freeze_lora_magnitude=True, freeze_lora_embedding=False)

            # A fresh Adam is built at every step because the trainable set changes
            # between phases; its moment estimates therefore restart each time.
            optimizer_ab = torch.optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=inner_lr)

            if penalty_term:
                # Lower-level reference copy; it inherits the phase-1 freeze state.
                model_LL = copy.deepcopy(model)
                optimizer_LL = torch.optim.Adam(filter(lambda p: p.requires_grad, model_LL.parameters()), lr=inner_lr)

            for _ in range(num_inner_step):
                model.train()

                # A new iterator is opened each time, i.e. one random batch
                # from the shuffled training loader.
                batch = next(iter(train_loader))
                batch = {k: v.to(device) for k, v in batch.items()}

                loss_train = model(**batch).loss
                loss_val = model(**batch_val).loss

                loss_reg = 0
                for param, param2 in reg_pairs:
                    loss_reg += compute_regularization(param, param2)

                loss = 1/gamma * loss_val + loss_train + lamba * loss_reg

                optimizer_ab.zero_grad()
                loss.backward()
                optimizer_ab.step()

                if penalty_term:
                    # The reference copy minimises the training loss alone.
                    model_LL.train()
                    loss_train_LL = model_LL(**batch).loss

                    optimizer_LL.zero_grad()
                    loss_train_LL.backward()
                    optimizer_LL.step()

            # ---- Phase 2: freeze A and B, update m ----
            freeze_layers(model, freeze_lora_magnitude=False, freeze_lora_embedding=True)

            # Upper-level loss on the current held-out batch
            loss_m = model(**batch_val).loss

            if penalty_term:
                freeze_layers(model_LL, freeze_lora_magnitude=False, freeze_lora_embedding=True)
                # NOTE(review): this branch uses a fixed lr of 1e-3 rather than
                # `learning_rate` - confirm that is intended.
                optimizer_m = torch.optim.Adam(
                    filter(lambda p: p.requires_grad, list(model.parameters()) + list(model_LL.parameters())),
                    lr=1e-3)

                batch = next(iter(train_loader))
                batch = {k: v.to(device) for k, v in batch.items()}
                outputs_LL = model_LL(**batch)
                outputs = model(**batch)

                loss_train_LL = outputs_LL.loss
                loss_train = outputs.loss

                # Penalise the gap between the model's training loss and the
                # training loss of the lower-level reference copy.
                loss_m += gamma * (loss_train - loss_train_LL)
            else:
                optimizer_m = torch.optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=learning_rate)

            optimizer_m.zero_grad()
            loss_m.backward()
            optimizer_m.step()

        # Only the optimisation pass above counts towards the time budget.
        train_time = (time.time() - start_time)
        time_stamp.append(time_stamp[-1] + train_time)

        tr_acc.append(compute_accuracy(model, train_loader, device))
        tr_loss.append(compute_loss(model, train_loader, device))

        test_acc.append(compute_accuracy(model, test_loader, device))
        test_loss.append(compute_loss(model, test_loader, device))

        print('Time elapsed: %.2f min' % (time_stamp[-1]/60))
        print(f"Epoch {epoch+1} Training Loss: {tr_loss[-1]}")
        print(f"Epoch {epoch+1} Training Accuracy: {tr_acc[-1]}")

    return (tr_acc, tr_loss), (test_acc, test_loss), (time_stamp)

## Train BiDoRa-Origin

In [10]:
args = {"gamma":15,
       "lamba": 1e-3,
       "inner_lr":5e-4,
       "learning_rate":5e-4,
       "tokenized_train_dataset": tokenized_datasets["train"],
       "tokenized_test_dataset": tokenized_datasets["test"],
       "prop_train": 0.66,
       "max_time": 1800,
       "num_inner_step":1
       }

def train_bidora_approx(num_epochs, model, epsilon=1e-3, args = args, device=DEVICE,xi = 0.1):
    """Alternating two-phase DoRA training with a finite-difference correction.

    This is the algorithm that was used in the biDoRa paper (per the original
    author's comment).

    The training split in `args["tokenized_train_dataset"]` is cut in two: the
    first `prop_train` fraction feeds the lower-level ("A/B") updates, the rest
    feeds the upper-level ("m") updates. For every upper-level batch:

    * Phase 1 - `freeze_layers(..., True, False)` is applied and
      `num_inner_step` Adam steps (lr `inner_lr`) are taken on
      `loss_val / gamma + loss_train + lamba * loss_reg`.
    * Phase 2 - `freeze_layers(..., False, True)` is applied. With `g` the
      gradient of the upper-level batch loss w.r.t. the trainable parameters
      `M`, the training-loss gradient is evaluated at `M + epsilon*g` and
      `M - epsilon*g`, and one Adam step (lr `learning_rate`) is taken with
      the gradient `g - xi * (grad_plus - grad_minus) / (2*epsilon)`.

    Parameters
    ----------
    num_epochs : upper bound on the number of passes over the upper-level loader.
    model : PEFT/DoRA-wrapped classifier, already on `device`.
    epsilon : half-width of the central finite difference.
    args : dict with keys gamma, lamba, inner_lr, learning_rate,
        tokenized_train_dataset, tokenized_test_dataset, prop_train,
        max_time (seconds of accumulated training time), num_inner_step.
    device : device batches are moved to.
    xi : weight of the finite-difference term.

    Returns
    -------
    ((tr_acc, tr_loss), (test_acc, test_loss), time_stamp) - each list holds
    the value before training followed by one value per completed epoch.

    Fixes with respect to the previous version:
    * Each perturbed training gradient is computed immediately after its
      forward pass. Previously the parameters had already been overwritten
      through `.data` before `autograd.grad` ran, so backward could read the
      restored values instead of the perturbed ones.
    * Gradients are taken without `create_graph=True`; no second-order graph
      is needed and the tensors stored in `.grad` no longer carry a graph.
    * The final-gradient loop no longer rebinds `grad_v` while iterating it.
    * A training batch is drawn for phase 2 when `num_inner_step` is 0
      (previously `batch` was undefined in that case).
    * The parameter-name scan for the regulariser is done once.
    """
    gamma = args["gamma"]
    lamba = args["lamba"]
    inner_lr = args["inner_lr"]
    learning_rate = args["learning_rate"]
    tokenized_train_dataset = args["tokenized_train_dataset"]
    tokenized_test_dataset = args["tokenized_test_dataset"]
    prop_train = args["prop_train"]
    max_time = args["max_time"]
    num_inner_step = args["num_inner_step"]

    # Contiguous split: first part for the A/B updates, remainder for the m updates.
    train_size = int(prop_train * len(tokenized_train_dataset))
    val_size = len(tokenized_train_dataset) - train_size
    ab_train_dataset = tokenized_train_dataset.select(range(train_size))
    m_train_dataset = tokenized_train_dataset.select(range(train_size, train_size + val_size))

    train_loader = DataLoader(dataset=ab_train_dataset,
                              batch_size=BATCH_SIZE,
                              shuffle=True)

    val_loader = DataLoader(dataset=m_train_dataset,
                            batch_size=BATCH_SIZE,
                            shuffle=False)

    test_loader = DataLoader(dataset=tokenized_test_dataset,
                             batch_size=BATCH_SIZE,
                             shuffle=False)

    # Metrics of the untouched model
    tr_acc = [compute_accuracy(model, train_loader, device)]
    tr_loss = [compute_loss(model, train_loader, device)]

    test_acc = [compute_accuracy(model, test_loader, device)]
    test_loss = [compute_loss(model, test_loader, device)]

    time_stamp = [0]

    # Parameter pairs fed to the regulariser, resolved once (name-based only).
    # NOTE(review): the filter looks for "lora_embedding_A"/"lora_embedding_B"
    # and compares character 7 of the two names ("7 is the layer number" in
    # the original comment). Confirm this matches the intended parameters of
    # the wrapped model - if no name matches, loss_reg stays 0.
    named_params = list(model.named_parameters())
    reg_pairs = [
        (param, param2)
        for name, param in named_params
        for name2, param2 in named_params
        if "lora_embedding_A" in name and "lora_embedding_B" in name2 and name[7] == name2[7]
    ]

    for epoch in range(num_epochs):
        start_time = time.time()
        print(f"Epoch {epoch+1}/{num_epochs}")
        if time_stamp[-1] >= max_time:
            break

        for batch_val in val_loader:
            batch_val = {k: v.to(device) for k, v in batch_val.items()}

            # ---- Phase 1: update A and B, freeze m ----
            freeze_layers(model, freeze_lora_magnitude=True, freeze_lora_embedding=False)

            # A fresh Adam is built at every step because the trainable set changes
            # between phases; its moment estimates therefore restart each time.
            optimizer_ab = torch.optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=inner_lr)

            batch = None
            for _ in range(num_inner_step):
                model.train()

                # A new iterator is opened each time, i.e. one random batch
                # from the shuffled training loader.
                batch = next(iter(train_loader))
                batch = {k: v.to(device) for k, v in batch.items()}

                loss_train = model(**batch).loss
                loss_val = model(**batch_val).loss

                loss_reg = 0
                for param, param2 in reg_pairs:
                    loss_reg += compute_regularization(param, param2)

                loss = 1/gamma * loss_val + loss_train + lamba * loss_reg

                optimizer_ab.zero_grad()
                loss.backward()
                optimizer_ab.step()

            # ---- Phase 2: freeze A and B, update m ----
            freeze_layers(model, freeze_lora_magnitude=False, freeze_lora_embedding=True)

            m_params = [p for p in model.parameters() if p.requires_grad]
            optimizer_m = torch.optim.Adam(m_params, lr=learning_rate)

            if batch is None:
                # No inner step ran, so no training batch is available yet.
                batch = {k: v.to(device) for k, v in next(iter(train_loader)).items()}

            # Gradient of the upper-level loss w.r.t. the trainable parameters
            loss_m = model(**batch_val).loss
            grad_val = torch.autograd.grad(loss_m, m_params)

            # Snapshot of M so it can be restored after the perturbations
            original_values = [p.detach().clone() for p in m_params]

            # Training-loss gradients at M + eps*g and M - eps*g. Each gradient
            # is taken before the parameters are touched again.
            perturbed_grads = []
            for sign in (1.0, -1.0):
                with torch.no_grad():
                    for p, p0, g in zip(m_params, original_values, grad_val):
                        p.copy_(p0 + sign * epsilon * g)
                train_loss_perturbed = model(**batch).loss
                perturbed_grads.append(torch.autograd.grad(train_loss_perturbed, m_params))
            grad_tr_plus, grad_tr_minus = perturbed_grads

            # Reset model parameters to the unperturbed M
            with torch.no_grad():
                for p, p0 in zip(m_params, original_values):
                    p.copy_(p0)

            optimizer_m.zero_grad()

            # Final gradient: upper-level gradient minus xi times the central difference
            for p, g_val, g_plus, g_minus in zip(m_params, grad_val, grad_tr_plus, grad_tr_minus):
                central_diff = (g_plus - g_minus) / (2 * epsilon)
                p.grad = g_val - xi * central_diff

            optimizer_m.step()

        # Only the optimisation pass above counts towards the time budget.
        train_time = (time.time() - start_time)
        time_stamp.append(time_stamp[-1] + train_time)

        tr_acc.append(compute_accuracy(model, train_loader, device))
        tr_loss.append(compute_loss(model, train_loader, device))

        test_acc.append(compute_accuracy(model, test_loader, device))
        test_loss.append(compute_loss(model, test_loader, device))

        print('Time elapsed: %.2f min' % (time_stamp[-1]/60))
        print(f"Epoch {epoch+1} Training Loss: {tr_loss[-1]}")
        print(f"Epoch {epoch+1} Training Accuracy: {tr_acc[-1]}")

    return (tr_acc, tr_loss), (test_acc, test_loss), (time_stamp)

## Train DoRa, BiDoRa

### DoRa

In [11]:
# Experiment settings shared by the cells below.
num_exp = 3
num_epochs = 15
learning_rate =5e-4

output_dora_list = []

# Wrap a copy of the base model with DoRA adapters; `model` itself stays un-adapted.
checkpoint = "google-bert/bert-large-cased"
model = AutoModelForSequenceClassification.from_pretrained(checkpoint,num_labels = 2)
model_dora = copy.deepcopy(model) # AutoModelForSequenceClassification.from_pretrained(checkpoint,num_labels = 2)
peft_config_dora = LoraConfig(use_dora=True, task_type=TaskType.SEQ_CLS, inference_mode=False, r=8, lora_alpha=32, lora_dropout=0.1)
model_dora = get_peft_model(model_dora, peft_config_dora)

# (An Adam over the un-adapted `model` used to be created here; it was never
# used because every run below builds its own optimiser.)
args = {"gamma":15,
       "lamba": 1e-3,
       "inner_lr":5e-4,
       "learning_rate":5e-4,
       "tokenized_train_dataset": tokenized_datasets["train"],
       "tokenized_test_dataset": tokenized_datasets["test"],
       "prop_train": 0.66,
       "max_time": 3600,
       "num_inner_step":1
       }

# One independent run per seed, each starting from the same adapted model.
for i in range(num_exp):
    set_seed(i)
    print("random_seed:",i)
    print("========Iteration:",i,"=========")
    model_dora_=copy.deepcopy(model_dora)
    model_dora_.to(DEVICE)
    optimizer = torch.optim.Adam(model_dora_.parameters(), lr=learning_rate)
    output_dora = train(num_epochs, model_dora_, optimizer, train_loader, test_loader, args = args)
    output_dora_list.append(output_dora)
    print(f'Test accuracy DoRA finetune: {compute_accuracy(model_dora_, test_loader, DEVICE):.2f}')

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-large-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Setting seed: 0
random_seed: 0
Epoch 1/15
Time elapsed: 0.66 min
Epoch 1 Training Loss: 0.5340764075517654
Epoch 1 Training Accuracy: 0.75859375
Epoch 2/15
Time elapsed: 1.32 min
Epoch 2 Training Loss: 0.36950838342309
Epoch 2 Training Accuracy: 0.8546875
Epoch 3/15
Time elapsed: 1.97 min
Epoch 3 Training Loss: 0.21372515708208084
Epoch 3 Training Accuracy: 0.93984375
Epoch 4/15
Time elapsed: 2.62 min
Epoch 4 Training Loss: 0.10377543163485825
Epoch 4 Training Accuracy: 0.97265625
Epoch 5/15
Time elapsed: 3.27 min
Epoch 5 Training Loss: 0.0918227544403635
Epoch 5 Training Accuracy: 0.96796875
Epoch 6/15
Time elapsed: 3.92 min
Epoch 6 Training Loss: 0.03835576356505044
Epoch 6 Training Accuracy: 0.9859375
Epoch 7/15
Time elapsed: 4.56 min
Epoch 7 Training Loss: 0.013878559251315892
Epoch 7 Training Accuracy: 0.996875
Epoch 8/15
Time elapsed: 5.21 min
Epoch 8 Training Loss: 0.0037915125547442587
Epoch 8 Training Accuracy: 0.9984375
Epoch 9/15
Time elapsed: 5.86 min
Epoch 9 Training Loss:

In [14]:
# Persist the DoRA run histories as the text repr of the list.
output_dir = "output/"
os.makedirs(output_dir, exist_ok=True)
dora_log_path = os.path.join(output_dir, "output_dora_list.txt")
with open(dora_log_path, "w") as log_file:
    log_file.write(str(output_dora_list))

In [16]:
# Save the weights of the last DoRA run.
output_dir = "output/model/"
os.makedirs(output_dir, exist_ok=True)
dora_weights_path = os.path.join(output_dir, "model_dora.pth")
torch.save(model_dora_.state_dict(), dora_weights_path)

### BiDoRa PBGD & PBGD_Free

In [None]:
# PBGD-Free variant: alternating training without the penalty term.
# Reuses num_exp / num_epochs / args from the DoRA cell.
output_bidora_PBGD_Free_list = []

checkpoint = "google-bert/bert-large-cased"
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
model_dora = copy.deepcopy(model)
peft_config_dora = LoraConfig(
    use_dora=True,
    task_type=TaskType.SEQ_CLS,
    inference_mode=False,
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
)
model_dora = get_peft_model(model_dora, peft_config_dora)

# One independent run per seed, each starting from the same adapted model.
for i in range(num_exp):
    set_seed(i)
    print("random_seed:",i)
    print("========Iteration:",i,"=========")
    model_dora_ = copy.deepcopy(model_dora)
    model_dora_.to(DEVICE)
    output_dora = train_bidora_penalty(num_epochs, model_dora_, args=args, penalty_term=False)
    output_bidora_PBGD_Free_list.append(output_dora)
    print(f'Test accuracy DoRA finetune: {compute_accuracy(model_dora_, test_loader, DEVICE):.2f}')

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-large-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Setting seed: 0
random_seed: 0


In [None]:
# Save the weights of the last PBGD-Free run.
output_dir = "output/model/"
pbgd_free_weights_path = os.path.join(output_dir, "model_bidora_PBGD_Free.pth")
torch.save(model_dora_.state_dict(), pbgd_free_weights_path)

In [None]:
# Persist the PBGD-Free run histories as the text repr of the list.
output_dir = "output/"
pbgd_free_log_path = os.path.join(output_dir, "output_bidora_PBGD_Free_list.txt")
with open(pbgd_free_log_path, "w") as log_file:
    log_file.write(str(output_bidora_PBGD_Free_list))

In [None]:
# PBGD variant: alternating training with the penalty term enabled.
output_bidora_list = []

for i in range(num_exp):
    set_seed(i)
    print("random_seed:",i)
    print("========Iteration:",i,"=========")
    model_dora_ = copy.deepcopy(model_dora)
    model_dora_.to(DEVICE)
    output_dora = train_bidora_penalty(
        num_epochs,
        model_dora_,
        args=args,
        penalty_term=True,
    )
    output_bidora_list.append(output_dora)
    print(f'Test accuracy DoRA finetune: {compute_accuracy(model_dora_, test_loader, DEVICE):.2f}')

In [None]:
# Save the weights of the last PBGD run.
output_dir = "output/model/"
bidora_weights_path = os.path.join(output_dir, "model_bidora.pth")
torch.save(model_dora_.state_dict(), bidora_weights_path)

In [None]:
# Persist the PBGD run histories as the text repr of the list.
output_dir = "output/"
bidora_log_path = os.path.join(output_dir, "output_bidora_list.txt")
with open(bidora_log_path, "w") as log_file:
    log_file.write(str(output_bidora_list))

### BiDoRa Origin

In [None]:
# BiDoRA-Origin: alternating training with the finite-difference update.
# Reuses num_exp / num_epochs / args from the DoRA cell.
checkpoint = "google-bert/bert-large-cased"
model = AutoModelForSequenceClassification.from_pretrained(checkpoint,num_labels = 2)
model_dora = copy.deepcopy(model) # AutoModelForSequenceClassification.from_pretrained(checkpoint,num_labels = 2)
peft_config_dora = LoraConfig(use_dora=True, task_type=TaskType.SEQ_CLS, inference_mode=False, r=8, lora_alpha=32, lora_dropout=0.1)
model_dora = get_peft_model(model_dora, peft_config_dora)

# The result list must exist before the loop appends to it; its
# initialisation was previously commented out, so the loop raised NameError.
output_bidora_appx_list = []

for i in range(num_exp):
    set_seed(i)
    print("random_seed:",i)
    print("========Iteration:",i,"=========")
    model_dora_=copy.deepcopy(model_dora)
    model_dora_.to(DEVICE)
    output_dora = train_bidora_approx(num_epochs, model_dora_,args= args)
    output_bidora_appx_list.append(output_dora)
    print(f'Test accuracy DoRA finetune: {compute_accuracy(model_dora_, test_loader, DEVICE):.2f}')

In [None]:
# Save the weights of the last BiDoRA-Origin run.
output_dir = "output/model/"
os.makedirs(output_dir, exist_ok=True)
torch.save(model_dora_.state_dict(), output_dir+"model_bidora_origin.pth")

# Persist the BiDoRA-Origin histories. The previous version wrote
# `output_bidora_list` (the PBGD results) into this file by mistake.
output_dir = "output/"
os.makedirs(output_dir, exist_ok=True)
with open(output_dir+"output_bidora_origin_list.txt", "w") as f:
    f.write(str(output_bidora_appx_list))

In [None]:
# Full fine-tuning baseline on the un-adapted model.
# The model has to live on the same device the batches are moved to.
model.to(DEVICE)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
# `train` expects (..., train_loader, test_loader, args, device); DEVICE was
# previously passed in the test_loader position.
output = train(num_epochs, model, optimizer, train_loader, test_loader, device=DEVICE)
print(f'Test accuracy: {compute_accuracy(model, test_loader, DEVICE):.2f}')

In [None]:
# Persist the full fine-tuning history as the text repr of the tuple.
output_dir = "output/"
baseline_log_path = os.path.join(output_dir, "output.txt")
with open(baseline_log_path, "w") as log_file:
    log_file.write(str(output))

In [None]:
# Persist the LoRA run history as the text repr of the object.
# NOTE(review): `output_lora` is not assigned anywhere in the visible part of
# this notebook; on a fresh kernel this cell would raise NameError unless a
# LoRA experiment cell exists elsewhere - confirm or remove.
output_dir = "output/"
with open(output_dir+"output_lora.txt", "w") as f:
    f.write(str(output_lora))

# Output

In [None]:
# Reload the run histories written by the save cells above.
# SECURITY NOTE(review): `eval` executes whatever the file contains. That is
# only acceptable because these files are produced by this notebook; do not
# point these paths at files from an untrusted source. A data-only format
# (e.g. JSON) on both the write and the read side would remove the risk.
output_dir = "output/"
with open(output_dir+"output_dora_list.txt", "r") as f:
    output_dora_list = eval(f.read())
    
with open(output_dir+"output_bidora_PBGD_Free_list.txt", "r") as f:
    output_bidora_PBGD_Free_list = eval(f.read())

with open(output_dir+"output_bidora_list.txt", "r") as f:
    output_bidora_list = eval(f.read())

In [None]:
def extract_and_pad(output_lists):
    """Stack variable-length sequences into a 2-D array, padding on the right.

    Every sequence shorter than the longest one is extended by repeating its
    last value. The input sequences are left untouched (the previous version
    extended the caller's lists in place).

    Parameters
    ----------
    output_lists : sequence of lists, one per run.

    Returns
    -------
    np.ndarray with one row per input list and `max(len(lst))` columns;
    an empty array when `output_lists` is empty.
    """
    if len(output_lists) == 0:
        return np.array([])

    max_len = max(len(lst) for lst in output_lists)

    padded_rows = []
    for lst in output_lists:
        row = list(lst)  # copy so the caller's list is not modified
        missing = max_len - len(row)
        if missing > 0:
            row.extend([row[-1]] * missing)  # pad with last value
        padded_rows.append(row)

    return np.array(padded_rows)

In [None]:
# Each run record is ((tr_acc, tr_loss), (test_acc, test_loss), time_stamp);
# run[1][0] is therefore the per-epoch test accuracy of that run.
test_acc_dora = extract_and_pad([run[1][0] for run in output_dora_list])

test_acc_bidora = extract_and_pad([run[1][0] for run in output_bidora_list])

test_acc_bidora_PBGD_Free = extract_and_pad(
    [run[1][0] for run in output_bidora_PBGD_Free_list]
)

In [None]:
# Mean and standard deviation across runs of the final-epoch test accuracy,
# printed in the order: PBGD-Free, PBGD, DoRA.
for acc_matrix in (test_acc_bidora_PBGD_Free, test_acc_bidora, test_acc_dora):
    final_mean = acc_matrix.mean(0)[-1]
    final_std = acc_matrix.std(0)[-1]
    print(final_mean, final_std)

# Movie

In [None]:
# IMDB movie reviews.
raw_datasets = load_dataset("imdb")
# NOTE(review): this tokenizer comes from "bert-base-cased" while the model
# loaded further down uses "google-bert/bert-large-cased" - confirm the
# pairing is intended.
tokenizer = AutoTokenizer.from_pretrained(
    "bert-base-cased",
    padding="max_length",
    truncation=True,
)

In [None]:
def tokenize_function(examples):
    """Tokenize a batch of reviews, padded to the tokenizer's maximum length."""
    return tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True,
    )

# Drop the raw text, expose the target as "labels", return torch tensors.
tokenized_datasets = (
    raw_datasets.map(tokenize_function, batched=True)
    .remove_columns(["text"])
    .rename_column("label", "labels")
    .with_format("torch")
)

In [None]:
# Full splits, plus fixed random subsets (shuffle seed 42) for quick experiments.
full_train_dataset = tokenized_datasets["train"]
full_eval_dataset = tokenized_datasets["test"]
small_train_dataset = full_train_dataset.shuffle(seed=42).select(range(2000))
small_eval_dataset = full_eval_dataset.shuffle(seed=42).select(range(1000))

# Both loaders are built on the small subsets.
train_loader = DataLoader(
    dataset=small_train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
)

test_loader = DataLoader(
    dataset=small_eval_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
)

In [None]:
# Fresh base checkpoint with a newly initialised 2-way classification head.
checkpoint = "google-bert/bert-large-cased"
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=2,
)

model