In [16]:
import argparse
import os

import torch
from torch.optim import AdamW
from torch.utils.data import DataLoader
from peft import (
    get_peft_config,
    get_peft_model,
    get_peft_model_state_dict,
    set_peft_model_state_dict,
    LoraConfig,
    PeftType,
    PrefixTuningConfig,
    PromptEncoderConfig,
    TaskType,
)

import evaluate
from datasets import load_dataset
from transformers import AutoModelForSequenceClassification, AutoTokenizer, get_linear_schedule_with_warmup, set_seed
from tqdm import tqdm


import warnings

warnings.filterwarnings("ignore")

In [17]:
import numpy as np
import random

def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

set_seed(2999)  # 设置固定随机种子

In [18]:
torch.cuda.empty_cache()

In [19]:
device = "cuda"
batch_size = 8
model_name_or_path = "FacebookAI/roberta-large"
task = "stsb"
peft_type = PeftType.LORA
num_epochs = 30
lr = 2e-4

In [20]:
if any(k in model_name_or_path for k in ("gpt", "opt", "bloom")):
    padding_side = "left"
else:
    padding_side = "right"

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, padding_side=padding_side)
if getattr(tokenizer, "pad_token_id") is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id

datasets = load_dataset("glue", task)
metric = evaluate.load("glue", task)


def tokenize_function(examples):
    # max_length=None => use the model max length (it's actually the default)
    outputs = tokenizer(examples["sentence1"], examples["sentence2"], truncation=True, max_length=512)
    return outputs


tokenized_datasets = datasets.map(
    tokenize_function,
    batched=True,
    remove_columns=["idx", "sentence1", "sentence2"],
)

tokenized_datasets = tokenized_datasets.rename_column("label", "labels")


def collate_fn(examples):
    return tokenizer.pad(examples, padding="longest", return_tensors="pt")


# Instantiate dataloaders.
train_dataloader = DataLoader(tokenized_datasets["train"], shuffle=True, collate_fn=collate_fn, batch_size=batch_size)
eval_dataloader = DataLoader(
    tokenized_datasets["validation"], shuffle=False, collate_fn=collate_fn, batch_size=batch_size
)

In [21]:
torch.cuda.empty_cache()

In [22]:
peft_config = LoraConfig(task_type=TaskType.SEQ_CLS, inference_mode=False, r=8, lora_alpha=16, lora_dropout=0.1)
# target_modules = ["query", "value"]
chain_epoch = 8

In [23]:
model = AutoModelForSequenceClassification.from_pretrained(model_name_or_path, return_dict=True, num_labels=1)
model.config.problem_type = "regression"
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()
model

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at FacebookAI/roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 1,837,057 || all params: 357,197,826 || trainable%: 0.5143


PeftModelForSequenceClassification(
  (base_model): LoraModel(
    (model): RobertaForSequenceClassification(
      (roberta): RobertaModel(
        (embeddings): RobertaEmbeddings(
          (word_embeddings): Embedding(50265, 1024, padding_idx=1)
          (position_embeddings): Embedding(514, 1024, padding_idx=1)
          (token_type_embeddings): Embedding(1, 1024)
          (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (encoder): RobertaEncoder(
          (layer): ModuleList(
            (0-23): 24 x RobertaLayer(
              (attention): RobertaAttention(
                (self): RobertaSdpaSelfAttention(
                  (query): lora.Linear(
                    (base_layer): Linear(in_features=1024, out_features=1024, bias=True)
                    (lora_dropout): ModuleDict(
                      (default): Dropout(p=0.1, inplace=False)
                    )
                    (lora_A

In [24]:
# lambda_orth = 0.02
lambda_orth = 0.02
# lr_b = lambad_lr * lr_a
lambad_lr = 8.0
# 每隔 ？ step 计算一次正交性损失
orth_interval = 5  

In [25]:
# 获取LoRA参数
lora_params = {n: p for n, p in model.named_parameters() if 'lora' in n}
# 分组LoRA参数
param_groups = [
    {"params": [p for n, p in lora_params.items() if 'lora_A' in n], "lr": lr},  # A矩阵的学习率
    {"params": [p for n, p in lora_params.items() if 'lora_B' in n], "lr": lambad_lr * lr},  # B矩阵的学习率
]

beta1 = 0.9
beta2 = 0.99

optimizer = AdamW(param_groups, betas=(beta1, beta2))

#optimizer = AdamW(params=model.parameters(), lr=lr)

# Instantiate scheduler
lr_scheduler = get_linear_schedule_with_warmup(
    optimizer=optimizer,
    num_warmup_steps=0.06 * (len(train_dataloader) * chain_epoch),
    num_training_steps=(len(train_dataloader) * chain_epoch),
)

In [26]:
saved_lora_a_matrices = []  # 用于保存之前训练的 LoRA 模块的 A 矩阵
saved_lora_b_matrices = []  # 用于保存之前训练的 LoRA 模块的 B 矩阵

def extract_lora_a_matrices(model):
    """提取 LoRA 的 A 矩阵"""
    lora_matrices = []
    for name, param in model.named_parameters():
        if "lora_A" in name:  # 识别 LoRA 的 A 矩阵
            lora_matrices.append(param.detach().clone())  # 保存当前 A 矩阵
    return lora_matrices

def extract_lora_b_matrices(model):
    """提取 LoRA 的 B 矩阵"""
    lora_matrices = []
    for name, param in model.named_parameters():
        if "lora_B" in name:  # 识别 LoRA 的 B 矩阵
            lora_matrices.append(param.detach().clone())  # 保存当前 B 矩阵
    return lora_matrices

In [27]:
def orthogonality_loss(current_matrices, saved_matrices):
    """计算当前 LoRA 和之前所有 LoRA 的正交性损失"""
    loss = 0
    for B_t in current_matrices:
        for B_i in saved_matrices:
            # 计算 A_t 和 A_i 的正交性
            product = torch.matmul(B_t.T, B_i)  # 矩阵乘积
            loss += torch.norm(product, p="fro")  # Frobenius 范数
    return loss


In [28]:
import re

def apply_shared_lora_a_matrices(model, saved_lora_a_matrices):
    """
    将保存的 LoRA A 矩阵应用到当前模型中，并冻结这些参数。
    """
    lora_idx = 0
    for name, param in model.named_parameters():
        if 'lora_A' in name:  #  A 矩阵的名称包含 'lora_A'
            # 提取层和注意力头的信息，例如 "encoder.layer.0.attention.self.query.lora_A"
            layer_match = re.match(r'.*encoder.layer.(\d+)\.attention.self\.(query|value)\.lora_A', name)
            if layer_match:
                layer_num = int(layer_match.group(1))
                if layer_num < len(saved_lora_a_matrices) // 2:
                    shared_a = saved_lora_a_matrices[lora_idx]
                    param.data = shared_a.data.clone() 
                    # param.requires_grad = False
                    print(f"Applied shared LoRA A matrix for {name}")
                    
                    lora_idx += 1
                    

In [29]:
print(torch.initial_seed())

2999


In [30]:
model.to(device)
for epoch in range(num_epochs):
    if epoch != 0 and epoch % chain_epoch == 0:
        # 第一组lora的 A 矩阵作为共享 A 矩阵
        # epoch == chain_epoch // epoch % chain_epoch == 0
        if epoch == chain_epoch:
            saved_lora_a_matrices = extract_lora_a_matrices(model)
            print(f"Saved LoRA A matrices at epoch {epoch}")
        
        # 提取当前 LoRA 的 B 矩阵
        current_lora_b_matrices = extract_lora_b_matrices(model)
        # 将当前 B 矩阵保存到列表中
        saved_lora_b_matrices.extend(current_lora_b_matrices)
        orth_loss = 0.0
        
        # 合并
        model.merge_and_unload()
        peft_config.r = 8
        
        # 拓展
        model = get_peft_model(model, peft_config)
        # 共享 A
        if saved_lora_a_matrices:
            apply_shared_lora_a_matrices(model, saved_lora_a_matrices)
            # for name, param in model.named_parameters():
            #     print(f"{name}: requires_grad={param.requires_grad}")
            for name, param in model.named_parameters():
                if param.requires_grad:
                    print(f"Active parameter: {name}")

        
        #重新实例化优化器
        # 获取LoRA参数
        lora_params = {n: p for n, p in model.named_parameters() if 'lora' in n}
        # 分组LoRA参数
        param_groups = [
            {"params": [p for n, p in lora_params.items() if 'lora_A' in n], "lr": lr},  # A矩阵的学习率
            {"params": [p for n, p in lora_params.items() if 'lora_B' in n], "lr": lambad_lr * lr},  # B矩阵的学习率
        ]
        optimizer = AdamW(param_groups, betas=(beta1, beta2))
        # optimizer = AdamW(params=model.parameters(), lr=lr)
        lr_scheduler = get_linear_schedule_with_warmup(
            optimizer=optimizer,
            num_warmup_steps=0.06 * (len(train_dataloader) * chain_epoch),
            num_training_steps=(len(train_dataloader) * chain_epoch),
        )
        print(f"new lora, r = {peft_config.r}:")
        torch.cuda.empty_cache()
        
        
    model.train()
    for step, batch in enumerate(tqdm(train_dataloader)):
        batch.to(device)
        outputs = model(**batch)
        
        loss = outputs.loss
        # task_loss = outputs.loss
        if saved_lora_b_matrices and step % orth_interval == 0 and step != 0:
            current_lora_b_matrices = extract_lora_b_matrices(model)
            orth_loss = orthogonality_loss(current_lora_b_matrices, saved_lora_b_matrices)
        else:
            orth_loss = 0.0
        # 总损失
        loss = loss + lambda_orth * orth_loss
        
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

    model.eval()
    for step, batch in enumerate(tqdm(eval_dataloader)):
        batch.to(device)
        with torch.no_grad():
            outputs = model(**batch)
        # predictions = outputs.logits.argmax(dim=-1)
        # predictions, references = predictions, batch["labels"]
        predictions = outputs.logits.squeeze().cpu().numpy()
        references = batch["labels"].cpu().numpy() 
        metric.add_batch(
            predictions=predictions,
            references=references,
        )

    eval_metric = metric.compute()
    print(f"epoch {epoch}:", eval_metric)

  0%|          | 0/719 [00:00<?, ?it/s]You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
100%|██████████| 719/719 [00:22<00:00, 31.55it/s]
100%|██████████| 188/188 [00:02<00:00, 77.85it/s]


epoch 0: {'pearson': 0.9011675370298733, 'spearmanr': 0.8979504872144392}


100%|██████████| 719/719 [00:22<00:00, 32.39it/s]
100%|██████████| 188/188 [00:02<00:00, 77.21it/s]


epoch 1: {'pearson': 0.8973321643834214, 'spearmanr': 0.9053615563792489}


100%|██████████| 719/719 [00:22<00:00, 32.12it/s]
100%|██████████| 188/188 [00:02<00:00, 77.90it/s]


epoch 2: {'pearson': 0.8922727238960583, 'spearmanr': 0.898766359662877}


100%|██████████| 719/719 [00:22<00:00, 31.81it/s]
100%|██████████| 188/188 [00:02<00:00, 78.86it/s]


epoch 3: {'pearson': 0.90890638276731, 'spearmanr': 0.9161167114787687}


100%|██████████| 719/719 [00:22<00:00, 32.58it/s]
100%|██████████| 188/188 [00:02<00:00, 78.62it/s]


epoch 4: {'pearson': 0.921072718030693, 'spearmanr': 0.9185061775384523}


100%|██████████| 719/719 [00:22<00:00, 32.38it/s]
100%|██████████| 188/188 [00:02<00:00, 78.50it/s]


epoch 5: {'pearson': 0.9208138311240046, 'spearmanr': 0.92005767682842}


100%|██████████| 719/719 [00:22<00:00, 32.25it/s]
100%|██████████| 188/188 [00:02<00:00, 77.53it/s]


epoch 6: {'pearson': 0.9215812107710624, 'spearmanr': 0.9199664632424217}


100%|██████████| 719/719 [00:23<00:00, 30.99it/s]
100%|██████████| 188/188 [00:02<00:00, 70.33it/s]


epoch 7: {'pearson': 0.9227243780943744, 'spearmanr': 0.9196596729953881}
Saved LoRA A matrices at epoch 8
Applied shared LoRA A matrix for base_model.model.base_model.model.roberta.encoder.layer.0.attention.self.query.lora_A.default.weight
Applied shared LoRA A matrix for base_model.model.base_model.model.roberta.encoder.layer.0.attention.self.value.lora_A.default.weight
Applied shared LoRA A matrix for base_model.model.base_model.model.roberta.encoder.layer.1.attention.self.query.lora_A.default.weight
Applied shared LoRA A matrix for base_model.model.base_model.model.roberta.encoder.layer.1.attention.self.value.lora_A.default.weight
Applied shared LoRA A matrix for base_model.model.base_model.model.roberta.encoder.layer.2.attention.self.query.lora_A.default.weight
Applied shared LoRA A matrix for base_model.model.base_model.model.roberta.encoder.layer.2.attention.self.value.lora_A.default.weight
Applied shared LoRA A matrix for base_model.model.base_model.model.roberta.encoder.layer.

100%|██████████| 719/719 [00:30<00:00, 23.58it/s]
100%|██████████| 188/188 [00:02<00:00, 74.11it/s]


epoch 8: {'pearson': 0.9212019704240686, 'spearmanr': 0.918529786095649}


100%|██████████| 719/719 [00:29<00:00, 24.72it/s]
100%|██████████| 188/188 [00:02<00:00, 79.43it/s]


epoch 9: {'pearson': 0.9204851367185836, 'spearmanr': 0.9170069914517998}


100%|██████████| 719/719 [00:29<00:00, 24.10it/s]
100%|██████████| 188/188 [00:02<00:00, 66.62it/s]


epoch 10: {'pearson': 0.9170480134919688, 'spearmanr': 0.9139787829589986}


100%|██████████| 719/719 [00:30<00:00, 23.53it/s]
100%|██████████| 188/188 [00:02<00:00, 77.66it/s]


epoch 11: {'pearson': 0.9185378353908915, 'spearmanr': 0.9173619053232346}


100%|██████████| 719/719 [00:29<00:00, 24.17it/s]
100%|██████████| 188/188 [00:02<00:00, 77.51it/s]


epoch 12: {'pearson': 0.917109645475319, 'spearmanr': 0.9152421023350439}


100%|██████████| 719/719 [00:30<00:00, 23.87it/s]
100%|██████████| 188/188 [00:02<00:00, 73.40it/s]


epoch 13: {'pearson': 0.9202059254346818, 'spearmanr': 0.9172901197608169}


100%|██████████| 719/719 [00:30<00:00, 23.70it/s]
100%|██████████| 188/188 [00:02<00:00, 77.34it/s]


epoch 14: {'pearson': 0.9221988953441175, 'spearmanr': 0.9192400855175872}


100%|██████████| 719/719 [00:29<00:00, 23.97it/s]
100%|██████████| 188/188 [00:02<00:00, 78.51it/s]


epoch 15: {'pearson': 0.9218704412983547, 'spearmanr': 0.9195656747099619}
Applied shared LoRA A matrix for base_model.model.base_model.model.base_model.model.roberta.encoder.layer.0.attention.self.query.lora_A.default.weight
Applied shared LoRA A matrix for base_model.model.base_model.model.base_model.model.roberta.encoder.layer.0.attention.self.value.lora_A.default.weight
Applied shared LoRA A matrix for base_model.model.base_model.model.base_model.model.roberta.encoder.layer.1.attention.self.query.lora_A.default.weight
Applied shared LoRA A matrix for base_model.model.base_model.model.base_model.model.roberta.encoder.layer.1.attention.self.value.lora_A.default.weight
Applied shared LoRA A matrix for base_model.model.base_model.model.base_model.model.roberta.encoder.layer.2.attention.self.query.lora_A.default.weight
Applied shared LoRA A matrix for base_model.model.base_model.model.base_model.model.roberta.encoder.layer.2.attention.self.value.lora_A.default.weight
Applied shared LoRA

100%|██████████| 719/719 [00:36<00:00, 19.96it/s]
100%|██████████| 188/188 [00:02<00:00, 79.32it/s]


epoch 16: {'pearson': 0.9149693142714489, 'spearmanr': 0.9176299687494966}


100%|██████████| 719/719 [00:35<00:00, 20.00it/s]
100%|██████████| 188/188 [00:02<00:00, 77.99it/s]


epoch 17: {'pearson': 0.9176735175766283, 'spearmanr': 0.9164859483880591}


100%|██████████| 719/719 [00:36<00:00, 19.77it/s]
100%|██████████| 188/188 [00:02<00:00, 79.72it/s]


epoch 18: {'pearson': 0.9180746915358111, 'spearmanr': 0.9144052031817224}


100%|██████████| 719/719 [00:36<00:00, 19.84it/s]
100%|██████████| 188/188 [00:02<00:00, 80.65it/s]


epoch 19: {'pearson': 0.9182799444025239, 'spearmanr': 0.9182325794851138}


100%|██████████| 719/719 [00:35<00:00, 20.01it/s]
100%|██████████| 188/188 [00:02<00:00, 73.70it/s]


epoch 20: {'pearson': 0.9206504624082789, 'spearmanr': 0.9181823720816182}


100%|██████████| 719/719 [00:37<00:00, 19.43it/s]
100%|██████████| 188/188 [00:02<00:00, 78.29it/s]


epoch 21: {'pearson': 0.9202879318153517, 'spearmanr': 0.917738216155064}


100%|██████████| 719/719 [00:36<00:00, 19.84it/s]
100%|██████████| 188/188 [00:02<00:00, 77.14it/s]


epoch 22: {'pearson': 0.9200369904083513, 'spearmanr': 0.9175361786492673}


100%|██████████| 719/719 [00:36<00:00, 19.67it/s]
100%|██████████| 188/188 [00:02<00:00, 75.29it/s]


epoch 23: {'pearson': 0.9201190328350677, 'spearmanr': 0.917492315629712}
Applied shared LoRA A matrix for base_model.model.base_model.model.base_model.model.base_model.model.roberta.encoder.layer.0.attention.self.query.lora_A.default.weight
Applied shared LoRA A matrix for base_model.model.base_model.model.base_model.model.base_model.model.roberta.encoder.layer.0.attention.self.value.lora_A.default.weight
Applied shared LoRA A matrix for base_model.model.base_model.model.base_model.model.base_model.model.roberta.encoder.layer.1.attention.self.query.lora_A.default.weight
Applied shared LoRA A matrix for base_model.model.base_model.model.base_model.model.base_model.model.roberta.encoder.layer.1.attention.self.value.lora_A.default.weight
Applied shared LoRA A matrix for base_model.model.base_model.model.base_model.model.base_model.model.roberta.encoder.layer.2.attention.self.query.lora_A.default.weight
Applied shared LoRA A matrix for base_model.model.base_model.model.base_model.model.ba

100%|██████████| 719/719 [00:44<00:00, 16.30it/s]
100%|██████████| 188/188 [00:02<00:00, 78.25it/s]


epoch 24: {'pearson': 0.9163537444858784, 'spearmanr': 0.9139291441347508}


100%|██████████| 719/719 [00:43<00:00, 16.56it/s]
100%|██████████| 188/188 [00:02<00:00, 78.87it/s]


epoch 25: {'pearson': 0.9191724866125699, 'spearmanr': 0.9157878553802279}


100%|██████████| 719/719 [00:42<00:00, 16.86it/s]
100%|██████████| 188/188 [00:02<00:00, 80.39it/s]


epoch 26: {'pearson': 0.9165651760046348, 'spearmanr': 0.913706617290989}


100%|██████████| 719/719 [00:43<00:00, 16.61it/s]
100%|██████████| 188/188 [00:02<00:00, 77.43it/s]


epoch 27: {'pearson': 0.9184705183606894, 'spearmanr': 0.9166881032568133}


100%|██████████| 719/719 [00:42<00:00, 16.82it/s]
100%|██████████| 188/188 [00:02<00:00, 77.75it/s]


epoch 28: {'pearson': 0.9171397032289896, 'spearmanr': 0.9150774883425847}


100%|██████████| 719/719 [00:43<00:00, 16.61it/s]
100%|██████████| 188/188 [00:02<00:00, 78.99it/s]

epoch 29: {'pearson': 0.9174715354421523, 'spearmanr': 0.9145238473916083}





In [31]:
# 这两个指标值域为[-1, 1]
# 通过公式 (score + 1) / 2 对结果进行转换

In [32]:
# seed 3407
# bz 8
# num_epochs 30
# chain_epoch 8
# lambda_lora = 8.0
# lambda_orth = 0.5
# orth_interval = 5 
# lr = 2e-4
# 第一组lora A矩阵训练好后共享
# best 0.922 -> 0.961

In [33]:
# seed 42
# bz 8
# num_epochs 30
# chain_epoch 8
# lambda_lora = 8.0
# lambda_orth = 0.5
# orth_interval = 5 
# lr = 2e-4
# 第一组lora A矩阵训练好后共享
# best 0.9206 -> 0.9603

In [34]:
# seed 2345
# bz 8
# num_epochs 30
# chain_epoch 8
# lambda_lora = 8.0
# lambda_orth = 0.5
# orth_interval = 5 
# lr = 2e-4
# 第一组lora A矩阵训练好后共享
# best 0.925

In [35]:
# seed 114514
# bz 8
# num_epochs 30
# chain_epoch 8
# lambda_lora = 8.0
# lambda_orth = 0.5
# orth_interval = 5 
# lr = 2e-4
# 第一组lora A矩阵训练好后共享
# best 0.92183

In [None]:
# seed 2999
# bz 8
# num_epochs 30
# chain_epoch 8
# lambda_lora = 8.0
# lambda_orth = 0.5
# orth_interval = 5 
# lr = 2e-4
# 第一组lora A矩阵训练好后共享
# best 0.922