In [1]:
import os
import random

import numpy as np
import torch
from peft import get_peft_model, PrefixTuningConfig, TaskType, LoraConfig
from peft import PromptTuningConfig, PromptTuningInit
from torch import nn
from torch.optim import AdamW
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import get_linear_schedule_with_warmup

# NOTE(review): the star import presumably provides load_cleaned_data, collate_fn,
# trainModel and dp_train_2 (none of them is defined in this notebook); prefer
# importing those names explicitly.
from utils import *
from soft_embedding import SoftEmbedding

In [2]:
# Reproducibility and device configuration for the whole notebook.
SEED = 42

# Force synchronous CUDA kernel launches (simplifies debugging of CUDA errors).
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

# Seed every random number generator the notebook relies on.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
# Seeds all visible GPUs, not just the current one; silently ignored without CUDA.
torch.cuda.manual_seed_all(SEED)

# Ask cuDNN for deterministic algorithms and disable its auto-tuner.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# Fall back to the CPU when no GPU is present, matching the device selection
# performed in the model-setup cells further down.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [3]:
# Dataset to run on; the supported names are "sst2", "qnli", "qqp" and "mnli".
dataset_name = "sst2"
tokenized_data = load_cleaned_data(dataset_name)

# For MNLI, use the "validation_matched" split as the validation set.
if dataset_name == "mnli":
    tokenized_data["validation"] = tokenized_data["validation_matched"]

# Both loaders share the same batching options.
# NOTE(review): the training loader is not shuffled -- confirm this is intended.
loader_options = dict(shuffle=False, batch_size=1024, collate_fn=collate_fn)
train_dataloader = DataLoader(tokenized_data["train"], **loader_options)
val_dataloader = DataLoader(tokenized_data["validation"], **loader_options)

# Number of classes = number of distinct label values in the training split.
train_labels = tokenized_data["train"]["labels"]
num_labels = torch.unique(train_labels).numel()

loss_fn = nn.CrossEntropyLoss()



  0%|          | 0/3 [00:00<?, ?it/s]



# Soft-Prompt and LoRA without DP

In [4]:
# Non-DP experiment: BERT-tiny wrapped first with a text-initialised soft prompt
# (prompt tuning) and then with LoRA adapters.
model_name = "prajjwal1/bert-tiny"
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

# NOTE(review): this initialisation text is sentiment-specific; revisit it when
# dataset_name is not "sst2".
prompt_tuning_init_text = "Classify the sentiment of this sentence"

# Count the virtual tokens with a tokenizer loaded from the same checkpoint that
# is handed to PEFT as tokenizer_name_or_path, instead of relying on a
# `tokenizer` global that this notebook never defines.
prompt_tokenizer = BertTokenizer.from_pretrained(model_name)
num_virtual_tokens = len(prompt_tokenizer(prompt_tuning_init_text)["input_ids"])

peft_config = PromptTuningConfig(
    task_type=TaskType.SEQ_CLS,
    prompt_tuning_init=PromptTuningInit.TEXT,
    num_virtual_tokens=num_virtual_tokens,
    prompt_tuning_init_text=prompt_tuning_init_text,
    tokenizer_name_or_path=model_name,
)
model = get_peft_model(model, peft_config)

lora_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,  # Sequence classification task
    inference_mode=False,
    r=8,  # Low-rank adaptation dimension
    lora_alpha=32,
    lora_dropout=0.1,
)

# Apply LoRA on top of the prompt-tuned model.
model = get_peft_model(model, lora_config)

# Make sure the soft-prompt parameters are trainable after the LoRA wrap.
for name, param in model.named_parameters():
    if "prompt_encoder" in name:
        param.requires_grad = True

# Move the model to its device before the optimizer is constructed, as the
# torch.optim documentation recommends.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

epochs = 5
optimizer = AdamW(model.parameters(), lr=1e-2)

# Linear decay to zero over all optimisation steps, without warm-up.
lr_scheduler = get_linear_schedule_with_warmup(
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=len(train_dataloader) * epochs,
)

model.print_trainable_parameters()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-tiny and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 9,474 || all params: 4,395,652 || trainable%: 0.21553116579747442


In [5]:
# Run the training/validation helper on the model built in the previous cell.
# Arguments are positional; the tqdm callable itself is handed to the helper.
trainModel(
    model,
    optimizer,
    train_dataloader,
    val_dataloader,
    loss_fn,
    lr_scheduler,
    tqdm,
    dataset_name,
)

100%|██████████| 66/66 [00:17<00:00,  3.76it/s]


epoch=0: train_ppl=tensor(2.0050, device='cuda:0') train_epoch_loss=tensor(0.6956, device='cuda:0') 
Epoch 1, Validation Accuracy without DP: {'accuracy': 0.5126146788990825}


100%|██████████| 66/66 [00:18<00:00,  3.66it/s]


epoch=1: train_ppl=tensor(1.9602, device='cuda:0') train_epoch_loss=tensor(0.6730, device='cuda:0') 
Epoch 2, Validation Accuracy without DP: {'accuracy': 0.588302752293578}


100%|██████████| 66/66 [00:16<00:00,  4.08it/s]


epoch=2: train_ppl=tensor(1.8181, device='cuda:0') train_epoch_loss=tensor(0.5978, device='cuda:0') 
Epoch 3, Validation Accuracy without DP: {'accuracy': 0.7110091743119266}


100%|██████████| 66/66 [00:17<00:00,  3.82it/s]


epoch=3: train_ppl=tensor(1.7424, device='cuda:0') train_epoch_loss=tensor(0.5553, device='cuda:0') 
Epoch 4, Validation Accuracy without DP: {'accuracy': 0.7029816513761468}


100%|██████████| 66/66 [00:16<00:00,  3.94it/s]


epoch=4: train_ppl=tensor(1.7237, device='cuda:0') train_epoch_loss=tensor(0.5445, device='cuda:0') 
Epoch 5, Validation Accuracy without DP: {'accuracy': 0.7110091743119266}
Training complete!


# Soft-Prompt and LoRA with DP

In [6]:
# DP experiment: rebuild the same soft-prompt + LoRA model from a fresh
# checkpoint. The names `model`, `optimizer`, `lr_scheduler` and `epochs` are
# consumed by the DP training cell that follows.
model_name = "prajjwal1/bert-tiny"
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

# NOTE(review): this initialisation text is sentiment-specific; revisit it when
# dataset_name is not "sst2".
prompt_tuning_init_text = "Classify the sentiment of this sentence"

# Count the virtual tokens with a tokenizer loaded from the same checkpoint that
# is handed to PEFT as tokenizer_name_or_path, instead of relying on a
# `tokenizer` global that this notebook never defines.
prompt_tokenizer = BertTokenizer.from_pretrained(model_name)
num_virtual_tokens = len(prompt_tokenizer(prompt_tuning_init_text)["input_ids"])

peft_config = PromptTuningConfig(
    task_type=TaskType.SEQ_CLS,
    prompt_tuning_init=PromptTuningInit.TEXT,
    num_virtual_tokens=num_virtual_tokens,
    prompt_tuning_init_text=prompt_tuning_init_text,
    tokenizer_name_or_path=model_name,
)
model = get_peft_model(model, peft_config)

lora_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,  # Sequence classification task
    inference_mode=False,
    r=8,  # Low-rank adaptation dimension
    lora_alpha=32,
    lora_dropout=0.1,
)

# Apply LoRA on top of the prompt-tuned model.
model = get_peft_model(model, lora_config)

# Make sure the soft-prompt parameters are trainable after the LoRA wrap.
for name, param in model.named_parameters():
    if "prompt_encoder" in name:
        param.requires_grad = True

# Move the model to its device before the optimizer is constructed, as the
# torch.optim documentation recommends.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

epochs = 5
optimizer = AdamW(model.parameters(), lr=1e-2)

# Linear decay to zero over all optimisation steps, without warm-up.
lr_scheduler = get_linear_schedule_with_warmup(
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=len(train_dataloader) * epochs,
)

model.print_trainable_parameters()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-tiny and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 9,474 || all params: 4,395,652 || trainable%: 0.21553116579747442


In [7]:
# Merge the adapters into the underlying model, then run the DP training helper
# on the merged model's `base_model`.
# NOTE(review): `optimizer` and `lr_scheduler` were created from
# model.parameters() before this merge -- confirm they still reference the
# parameters of the module trained here, and that the soft prompt / LoRA weights
# are still part of what gets trained.
merged_model = model.merge_and_unload()
dp_train_2(
    merged_model.base_model,
    train_dataloader,
    tokenized_data,
    optimizer,
    lr_scheduler,
    epochs,
    val_dataloader,
    dataset_name,
)

100%|██████████| 66/66 [00:14<00:00,  4.46it/s]


Epoch 1, Validation Accuracy DP: {'accuracy': 0.6341743119266054}


100%|██████████| 66/66 [00:13<00:00,  4.78it/s]


Epoch 2, Validation Accuracy DP: {'accuracy': 0.6444954128440367}


100%|██████████| 66/66 [00:14<00:00,  4.57it/s]


Epoch 3, Validation Accuracy DP: {'accuracy': 0.6353211009174312}


100%|██████████| 66/66 [00:13<00:00,  4.80it/s]


Epoch 4, Validation Accuracy DP: {'accuracy': 0.6399082568807339}


100%|██████████| 66/66 [00:14<00:00,  4.41it/s]


Epoch 5, Validation Accuracy DP: {'accuracy': 0.6605504587155964}
Training complete
