In [1]:
from datasets import Dataset
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from transformers import AutoModel

import warnings

import numpy as np
import evaluate

warnings.filterwarnings("ignore")

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [4]:
from datasets import load_dataset

raw_datasets = load_dataset("glue", "mrpc")
raw_datasets


DataFilesNotFoundError: No (supported) data files found in glue

In [178]:
raw_train_dataset = raw_datasets["train"]
raw_train_dataset[100]


{'sentence1': 'The Nasdaq composite index inched up 1.28 , or 0.1 percent , to 1,766.60 , following a weekly win of 3.7 percent .',
 'sentence2': 'The technology-laced Nasdaq Composite Index .IXIC was off 24.44 points , or 1.39 percent , at 1,739.87 .',
 'label': 0,
 'idx': 114}

In [179]:
raw_train_dataset.features

{'sentence1': Value(dtype='string', id=None),
 'sentence2': Value(dtype='string', id=None),
 'label': ClassLabel(names=['not_equivalent', 'equivalent'], id=None),
 'idx': Value(dtype='int32', id=None)}

In [4]:
checkpoint = "FacebookAI/roberta-base"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [5]:
def process_func(example):
    return tokenizer(example["sentence1"], example["sentence2"], truncation=True,max_length=128)


In [6]:
tokenized_datasets = raw_datasets.map(process_func, batched=True)
tokenized_datasets

NameError: name 'raw_datasets' is not defined

In [7]:
# , remove_columns=raw_datasets.column_names
# 分词后应删去不必要的列
# 虽然对数据进行分词，但还需要对数据进行封装才可进入模型进行训练
# 这里不能调用pytorch的DataLoader，需要使用transformers 的 DataCollatorWithPadding

In [8]:
tokenized_datasets = tokenized_datasets.remove_columns(["sentence1", "sentence2", "idx"])
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
tokenized_datasets.set_format("torch")
tokenized_datasets["train"].column_names


NameError: name 'tokenized_datasets' is not defined

In [185]:
from transformers import DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, max_length=128)


In [186]:
tokenized_datasets["train"][0]

{'labels': tensor(1),
 'input_ids': tensor([    0, 10127,  1001,  6182,  1238,    39,  2138,  2156,  2661,    37,
           373,    22,     5,  4562,    22,  2156,     9, 12507,  7018, 23817,
            39,  1283,   479,     2,     2, 48310,  4506,     7,   123,    25,
           129,    22,     5,  4562,    22,  2156,  1918,  1001,  6182,  1238,
            39,  2138,     9, 12507,  7018, 23817,    39,  1283,   479,     2]),
 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1])}

In [187]:
tokenized_datasets["train"].features

{'labels': ClassLabel(names=['not_equivalent', 'equivalent'], id=None),
 'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None),
 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None)}

In [188]:
samples = tokenized_datasets["train"][:8]#取训练集前 8 列
samples = {k: v for k, v in samples.items() if k not in ["idx", "sentence1", "sentence2"]}#不需要这些列
[len(x) for x in samples["input_ids"]]#每一个样本的长度


[50, 60, 48, 67, 60, 52, 62, 33]

In [189]:
batch = data_collator(samples)
{k: v.shape for k, v in batch.items()}


{'labels': torch.Size([8]),
 'input_ids': torch.Size([8, 67]),
 'attention_mask': torch.Size([8, 67])}

In [190]:
# 如果是对序列进行分类就使用AutoModelForSequenceClassification；
# num_labels=2是我们要改输出层，输出层不用预训练模型了，输出层自己训练。

In [323]:
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at FacebookAI/roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [376]:
model

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): lora.Linear(
                (base_layer): Linear(in_features=768, out_features=768, bias=True)
                (lora_dropout): ModuleDict(
                  (default): Identity()
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=768, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (defaul

In [377]:
for name, parameter in model.named_parameters():
    print(name)

roberta.embeddings.word_embeddings.weight
roberta.embeddings.position_embeddings.weight
roberta.embeddings.token_type_embeddings.weight
roberta.embeddings.LayerNorm.weight
roberta.embeddings.LayerNorm.bias
roberta.encoder.layer.0.attention.self.query.base_layer.weight
roberta.encoder.layer.0.attention.self.query.base_layer.bias
roberta.encoder.layer.0.attention.self.query.lora_A.default.weight
roberta.encoder.layer.0.attention.self.query.lora_B.default.weight
roberta.encoder.layer.0.attention.self.key.weight
roberta.encoder.layer.0.attention.self.key.bias
roberta.encoder.layer.0.attention.self.value.base_layer.weight
roberta.encoder.layer.0.attention.self.value.base_layer.bias
roberta.encoder.layer.0.attention.self.value.lora_A.default.weight
roberta.encoder.layer.0.attention.self.value.lora_B.default.weight
roberta.encoder.layer.0.attention.output.dense.weight
roberta.encoder.layer.0.attention.output.dense.bias
roberta.encoder.layer.0.attention.output.LayerNorm.weight
roberta.encoder.

# lora

In [378]:
from peft import LoraConfig, TaskType, get_peft_model


In [379]:
config = LoraConfig(
    task_type = TaskType.SEQ_CLS, 
    target_modules = ["query", "value"], 
    lora_alpha = 32,
    r = 16,
    bias = "none",
    
    )
config
# modules_to_save= ["classifier.dense", "classifier.out_proj"],

LoraConfig(task_type=<TaskType.SEQ_CLS: 'SEQ_CLS'>, peft_type=<PeftType.LORA: 'LORA'>, auto_mapping=None, base_model_name_or_path=None, revision=None, inference_mode=False, r=8, target_modules={'query', 'value'}, exclude_modules=None, lora_alpha=8, lora_dropout=0.0, fan_in_fan_out=False, bias='none', use_rslora=False, modules_to_save=None, init_lora_weights=True, layers_to_transform=None, layers_pattern=None, rank_pattern={}, alpha_pattern={}, megatron_config=None, megatron_core='megatron.core', loftq_config={}, eva_config=None, use_dora=False, layer_replication=None, runtime_config=LoraRuntimeConfig(ephemeral_gpu_offload=False))

In [380]:
lora_model = get_peft_model(model, config)

lora_model.print_trainable_parameters()

trainable params: 887,042 || all params: 125,534,212 || trainable%: 0.7066


In [381]:
lora_model

PeftModelForSequenceClassification(
  (base_model): LoraModel(
    (model): RobertaForSequenceClassification(
      (roberta): RobertaModel(
        (embeddings): RobertaEmbeddings(
          (word_embeddings): Embedding(50265, 768, padding_idx=1)
          (position_embeddings): Embedding(514, 768, padding_idx=1)
          (token_type_embeddings): Embedding(1, 768)
          (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (encoder): RobertaEncoder(
          (layer): ModuleList(
            (0-11): 12 x RobertaLayer(
              (attention): RobertaAttention(
                (self): RobertaSdpaSelfAttention(
                  (query): lora.Linear(
                    (base_layer): Linear(in_features=768, out_features=768, bias=True)
                    (lora_dropout): ModuleDict(
                      (default): Identity()
                    )
                    (lora_A): ModuleDict(
          

# 训练

In [382]:
seed = int(torch.randint(0, 2 ** 12 - 1, (1,)).item())
seed

37681

In [383]:
training_args = TrainingArguments(
    output_dir="./glue_mrpc",
    num_train_epochs=30,
    per_device_train_batch_size = 32,
    per_device_eval_batch_size = 32,
    warmup_ratio = 0.1,
    learning_rate = 2e-4,
    optim = "adamw_hf",
    lr_scheduler_type = "linear",
    seed = seed,
    weight_decay = 0.01,
    evaluation_strategy = "epoch",
    logging_strategy = "epoch",
    save_strategy = "epoch",
    load_best_model_at_end=True
    )

# 
training_args

TrainingArguments(
_n_gpu=1,
accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False},
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
average_tokens_across_devices=False,
batch_eval_metrics=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_persistent_workers=False,
dataloader_pin_memory=True,
dataloader_prefetch_factor=None,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
dispatch_batches=None,
do_eval=True,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_do_concat_batches=True,
eval_on_start=False,
eval_steps=None,
eval_strategy=IntervalStrategy.EPOCH,
eval_

In [384]:
def compute_metrics(pred):
    metric = evaluate.load("glue", "mrpc")
    logits, labels = pred
    predictions = np.argmax(logits, axis=-1)
    
    return metric.compute(predictions=predictions, references=labels)

In [385]:
trainer = Trainer(
    lora_model,
    training_args,
    train_dataset = tokenized_datasets["train"],
    eval_dataset = tokenized_datasets["validation"],
    data_collator = data_collator,
    tokenizer = tokenizer,
    compute_metrics = compute_metrics,
)

In [386]:
trainer.train()


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.2,0.435071,0.877451,0.911661
2,0.2178,0.322546,0.884804,0.916519
3,0.208,0.387554,0.875,0.909735
4,0.1851,0.408915,0.877451,0.913194
5,0.1777,0.369928,0.879902,0.91358
6,0.147,0.329807,0.875,0.910369
7,0.1421,0.380385,0.870098,0.908463
8,0.1117,0.489378,0.870098,0.909402
9,0.1,0.515682,0.894608,0.925217
10,0.1032,0.529502,0.877451,0.912281


Using the latest cached version of the module from C:\Users\shaoc\.cache\huggingface\modules\evaluate_modules\metrics\evaluate-metric--glue\05234ba7acc44554edcca0978db5fa3bc600eeee66229abe79ff9887eacaf3ed (last modified on Wed Nov 27 15:46:17 2024) since it couldn't be found locally at evaluate-metric--glue, or remotely on the Hugging Face Hub.


TrainOutput(global_step=6900, training_loss=0.08220917936684428, metrics={'train_runtime': 285.739, 'train_samples_per_second': 385.107, 'train_steps_per_second': 24.148, 'total_flos': 4370384095471392.0, 'train_loss': 0.08220917936684428, 'epoch': 30.0})

In [345]:
trainer.save_model(output_dir="./glue_mrpc/lora")

# 在测试集上进行测试

In [333]:
# lora_model
predictions = trainer.predict(tokenized_datasets["test"])
print(predictions.predictions.shape, predictions.label_ids.shape)

# 为了进行比较需要对预测结果进行转换
preds = np.argmax(predictions.predictions, axis=-1)

metric = evaluate.load("glue", "mrpc")
metric.compute(predictions=preds, references=predictions.label_ids)

(1725, 2) (1725,)


{'accuracy': 0.8736231884057971, 'f1': 0.9031111111111111}

# merge_model

In [366]:
merge_model = lora_model.merge_and_unload()

test_sentence1 = raw_datasets["test"]["sentence1"]
test_sentence2 = raw_datasets["test"]["sentence2"]

test_inputs = tokenizer(
    test_sentence1, 
    test_sentence2, 
    truncation=True, 
    padding=True, 
    max_length=512,
    return_tensors="pt"
)
test_inputs.to(device)

merge_model.eval()

# 进行预测
with torch.no_grad():
    outputs = merge_model(**test_inputs)

# 获取预测标签
predictions = torch.argmax(outputs.logits, axis=-1)
labels = tokenized_datasets["test"]["labels"]
metric = evaluate.load("glue", "mrpc")
metric.compute(predictions = predictions, references = labels)

{'accuracy': 0.8736231884057971, 'f1': 0.9031111111111111}

In [360]:
from peft import PeftModel

base_model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

merge_model = PeftModel.from_pretrained(base_model, "./glue_mrpc/lora")


#base_model
#print()
merge_model


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at FacebookAI/roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): RobertaForSequenceClassification(
      (roberta): RobertaModel(
        (embeddings): RobertaEmbeddings(
          (word_embeddings): Embedding(50265, 768, padding_idx=1)
          (position_embeddings): Embedding(514, 768, padding_idx=1)
          (token_type_embeddings): Embedding(1, 768)
          (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (encoder): RobertaEncoder(
          (layer): ModuleList(
            (0-11): 12 x RobertaLayer(
              (attention): RobertaAttention(
                (self): RobertaSdpaSelfAttention(
                  (query): lora.Linear(
                    (base_layer): Linear(in_features=768, out_features=768, bias=True)
                    (lora_dropout): ModuleDict(
                      (default): Identity()
                    )
                    (lora_A): ModuleDict(
                      (d

In [363]:
def is_merge(original_weights, merged_weights):
    if not torch.allclose(original_weights, merged_weights):
        print("合并成功，权重发生了变化。")
    else:
        print("权重没有变化，合并失败。")

is_merge(base_model.classifier.dense.weight, merge_model.classifier.dense.weight)

权重没有变化，合并失败。


In [356]:
base_model.classifier.dense.weight = lora_model.classifier.dense.weight
#base_model.classifier.dense.bias = lora_model.classifier.dense.bias
base_model.classifier.out_proj.weight = lora_model.classifier.out_proj.weight
#base_model.classifier.out_proj.bias = lora_model.classifier.out_proj.bias

In [364]:
merge_model = merge_model.merge_and_unload()
merge_model.to(device)

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
         

In [365]:
def is_merge(original_weights, merged_weights):
    if not torch.allclose(original_weights, merged_weights):
        print("合并成功，权重发生了变化。")
    else:
        print("权重没有变化，合并失败。")

is_merge(base_model.classifier.dense.weight, merge_model.classifier.dense.weight)

权重没有变化，合并失败。


In [367]:
# original_weights = lora_model.roberta.encoder.layer[0].attention.self.query.weight
# merged_weights = merge_model.roberta.encoder.layer[0].attention.self.query.weight
def is_merge(original_weights, merged_weights):
    if not torch.allclose(original_weights, merged_weights):
        print("合并成功，权重发生了变化。")
    else:
        print("权重没有变化，合并失败。")

is_merge(lora_model.roberta.encoder.layer[0].attention.self.query.weight, merge_model.roberta.encoder.layer[0].attention.self.query.weight)

权重没有变化，合并失败。


In [359]:
test_sentence1 = raw_datasets["test"]["sentence1"]
test_sentence2 = raw_datasets["test"]["sentence2"]

test_inputs = tokenizer(
    test_sentence1, 
    test_sentence2, 
    truncation=True, 
    padding=True, 
    max_length=512,
    return_tensors="pt"
)
test_inputs.to(device)

merge_model.eval()

# 进行预测
with torch.no_grad():
    outputs = merge_model(**test_inputs)

# 获取预测标签
predictions = torch.argmax(outputs.logits, axis=-1)
labels = tokenized_datasets["test"]["labels"]
metric = evaluate.load("glue", "mrpc")
metric.compute(predictions = predictions, references = labels)

{'accuracy': 0.8736231884057971, 'f1': 0.9031111111111111}

In [387]:
import torch
torch.cuda.empty_cache()
