In [1]:
import os, sys
import torch
from torch.utils.data import DataLoader, TensorDataset
import datasets
import evaluate
from datasets import load_dataset, load_metric
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer,
    BitsAndBytesConfig,
    DistilBertForSequenceClassification
)
from peft import PeftModel, LoraConfig, prepare_model_for_kbit_training, get_peft_model, TaskType, AutoPeftModelForSequenceClassification, PeftConfig, PeftMixedModel, PromptEncoderConfig, AdaLoraConfig
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm
2024-03-12 12:21:51.578299: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-03-12 12:21:53.267118: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-12 12:21:53.267160: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-12 12:21:53.268115: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-03-12 12:21:53.2

In [2]:
# Load the base model and its tokenizer; a previously trained LoRA module can be
# attached afterwards by enabling the commented-out lines.
model_path = "./distilbert/"
# peft_path = "./distilbert-lora-judge/"
# peft_path = "./best_version/checkpoint-11500"

tokenizer = AutoTokenizer.from_pretrained(model_path)
# The checkpoint ships a 3-class classification head (see the load warning), so
# ignore_mismatched_sizes lets the head be re-initialised for 2 labels.
model = AutoModelForSequenceClassification.from_pretrained(
    model_path,
    num_labels=2,
    ignore_mismatched_sizes=True,
)
# model = PeftModel.from_pretrained(model, peft_path)
# model.print_trainable_parameters()

  return self.fget.__get__(instance, owner)()
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at ./distilbert/ and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([3, 768]) in the checkpoint and torch.Size([2, 768]) in the model instantiated
- classifier.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([2]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [27]:
'''
# Use p-tuning (this cell is disabled by the opening ''' above)
soft_prompt_config = PromptEncoderConfig(task_type="SEQ_CLS", num_virtual_tokens=20, encoder_hidden_size=128, num_layers=12, token_dim=768, num_attention_heads=16)
model = get_peft_model(model, soft_prompt_config)

In [3]:
# LoRA: low-rank adapters on the attention query and value projections.
# NOTE(review): the AdaLoRA cell below also calls get_peft_model on `model`;
# presumably only one of the two cells should be run per session - confirm.
# NOTE(review): the adapter name "lora_adapte" looks like a typo for
# "lora_adapter"; it is kept as-is because existing checkpoints may use it.
lora_config = LoraConfig(
    # Other task types: CAUSAL_LM, FEATURE_EXTRACTION, QUESTION_ANS,
    # SEQ_2_SEQ_LM, SEQ_CLS and TOKEN_CLS.
    task_type=TaskType.SEQ_CLS,
    target_modules=["q_lin", "v_lin"],
    r=8,
    lora_alpha=32,
    lora_dropout=0.01,
    bias="none",
)
model = get_peft_model(model, lora_config, adapter_name="lora_adapte")

In [3]:
# AdaLoRA on the attention query and value projections.
# AdaLoRA takes its ranks from init_r / target_r and ignores the inherited `r`,
# so the previous `r=8` had no effect. The values below are the library
# defaults that were actually in use, now spelled out. `peft_type` is set by
# the config class itself and is therefore omitted.
# NOTE(review): rank re-allocation only happens if `update_and_allocate` is
# called during training (and `total_step` is set); a stock Trainer loop does
# not appear to do that - confirm whether a fixed rank is intended.
ada_config = AdaLoraConfig(
    task_type=TaskType.SEQ_CLS,
    target_modules=["q_lin", "v_lin"],
    init_r=12,
    target_r=8,
    lora_alpha=32,
    lora_dropout=0.01,
)
model = get_peft_model(model, ada_config, adapter_name="ada_adapter")

In [4]:
# Move the model to the GPU when one is available; fall back to CPU instead of
# raising on machines without CUDA. The call is left as the last expression so
# the cell still displays the module tree.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

PeftModelForSequenceClassification(
  (base_model): AdaLoraModel(
    (model): DistilBertForSequenceClassification(
      (distilbert): DistilBertModel(
        (embeddings): Embeddings(
          (word_embeddings): Embedding(119547, 768, padding_idx=0)
          (position_embeddings): Embedding(512, 768)
          (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (transformer): Transformer(
          (layer): ModuleList(
            (0-5): 6 x TransformerBlock(
              (attention): MultiHeadSelfAttention(
                (dropout): Dropout(p=0.1, inplace=False)
                (q_lin): adalora.SVDLinear(
                  (base_layer): Linear(in_features=768, out_features=768, bias=True)
                  (lora_dropout): ModuleDict(
                    (ada_adapter): Dropout(p=0.01, inplace=False)
                  )
                  (lora_A): ParameterDict(  (ada_adapter): Parameter containin

In [5]:
# Bohrium dataset: Finetune-dataset-LLMKG
data_dir = "./dataset"
# One JSON-lines file per split: <split>_data.jsonl
data_files = {
    split: os.path.join(data_dir, f"{split}_data.jsonl")
    for split in ("train", "valid")
}
dataset = load_dataset("json", data_files=data_files)

Generating train split: 94000 examples [00:00, 164892.64 examples/s]
Generating valid split: 11000 examples [00:00, 133693.46 examples/s]


In [None]:
'''
# Hard-prompt variant: wrap the event description and the text in a fixed
# instruction. The three pieces are joined without any separator.
prompt_template = (
    "事件指引: {instruction}"
    "请仔细阅读以下文本内容，并判断是否包含针对上述事件的投诉或举报信息。"
    "具体内容: {input}"
)


def generate_prompt(instruction, inputs, prompt_template=prompt_template):
    """Fill the prompt template with one instruction/input pair.

    Args:
        instruction: value substituted for ``{instruction}``.
        inputs: value substituted for ``{input}``.
        prompt_template: format string containing ``{instruction}`` and
            ``{input}`` placeholders; defaults to the module-level template.

    Returns:
        str: the formatted prompt.
    """
    return prompt_template.format(instruction=instruction, input=inputs)
def tokenizer_func(example):
    """Tokenize a batch of examples using the hard-prompt template.

    Intended for ``Dataset.map(..., batched=True)``: every column in
    ``example`` is a list with one entry per row. The previous version passed
    the whole column lists to ``generate_prompt``, which produced a single
    prompt for the entire batch; one prompt is now built per row.

    Args:
        example: batch mapping with ``category_description``, ``text`` and
            ``label`` columns.

    Returns:
        The tokenizer output for the per-row prompts, plus a ``label`` column
        holding the labels converted to ``int``.
    """
    prompts = [
        generate_prompt(instruction, text)
        for instruction, text in zip(example["category_description"], example["text"])
    ]
    encoded = tokenizer(prompts, padding="max_length", truncation=True, max_length=512)
    # Return the converted labels explicitly instead of mutating the input batch.
    encoded["label"] = [int(item) for item in example["label"]]
    return encoded


In [6]:
def tokenizer_func(example):
    """Tokenize a batch of (category description, text) pairs.

    Intended for ``Dataset.map(..., batched=True)``: every column in
    ``example`` is a list with one entry per row.

    Args:
        example: batch mapping with ``category_description``, ``text`` and
            ``label`` columns.

    Returns:
        The tokenizer output for the sentence pairs, padded/truncated to 492
        tokens, plus a ``label`` column holding the labels converted to
        ``int``.
    """
    # NOTE(review): 492 presumably leaves room for the 20 virtual tokens of the
    # p-tuning config within the 512-position limit - confirm.
    encoded = tokenizer(
        example["category_description"],
        example["text"],
        padding="max_length",
        truncation=True,
        max_length=492,
    )
    # Return the converted labels as an output column rather than relying on
    # in-place mutation of the input batch being picked up by ``map``.
    encoded["label"] = [int(item) for item in example["label"]]
    return encoded

In [7]:
# Shuffle with a fixed seed so the row order (and therefore the run) is
# reproducible after a kernel restart, then tokenize both splits in batches.
SHUFFLE_SEED = 42
train_dataset = dataset["train"].shuffle(seed=SHUFFLE_SEED).map(tokenizer_func, batched=True)
valid_dataset = dataset["valid"].shuffle(seed=SHUFFLE_SEED).map(tokenizer_func, batched=True)

Map: 100%|██████████| 94000/94000 [00:16<00:00, 5545.80 examples/s]
Map: 100%|██████████| 11000/11000 [00:01<00:00, 5753.34 examples/s]


In [9]:
# Build the arguments in one constructor call so TrainingArguments can validate
# and normalise them. The previous version passed "./distilbert/training_args.bin"
# as the first positional argument (which is `output_dir`, not a file to load)
# and then assigned fields afterwards; those assignments bypass the
# post-construction checks, so `load_best_model_at_end` had no metric to track
# and checkpoints were still written every 500 steps instead of per epoch.
training_args = TrainingArguments(
    output_dir="./outputs03121/",
    run_name="./outputs03121/experiment_1",
    logging_dir="./outputs03121/",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    evaluation_strategy="epoch",
    # training_args.eval_steps = 100
    # load_best_model_at_end requires checkpoints on the same schedule as evaluation.
    save_strategy="epoch",
    load_best_model_at_end=True,
    # "accuracy" is the key returned by compute_metrics; higher is better.
    metric_for_best_model="accuracy",
    greater_is_better=True,
)
# NOTE(review): with a named PEFT adapter, verify that the best checkpoint is
# actually reloaded at the end of training (check the Trainer log).

In [10]:
#手搓精度函数
from sklearn.metrics import accuracy_score
def compute_metrics(pred):
    """Compute classification accuracy for the Trainer's evaluation loop.

    Args:
        pred: object exposing ``predictions`` (class logits, or a tuple/list
            whose first element is assumed to be the logits) and ``label_ids``
            (gold class ids). Both are expected to be NumPy-style arrays.

    Returns:
        dict: ``{'accuracy': value}`` where ``value`` is the fraction of rows
        whose arg-max class equals the label, as a plain Python float.
    """
    logits = pred.predictions
    # Some models return extra outputs next to the logits; keep only the logits.
    if isinstance(logits, (tuple, list)):
        logits = logits[0]
    predicted = logits.argmax(-1)
    # Element-wise match rate; same value as sklearn's accuracy_score for
    # 1-D integer labels, without needing the mid-notebook import.
    accuracy = float((predicted == pred.label_ids).mean())
    return {
        'accuracy': accuracy,
    }

In [None]:
# Take the padding id from the tokenizer. The previous line copied
# `eos_token_id`, which BERT-style configs such as DistilBERT's do not define,
# so it overwrote the valid padding id with None.
model.config.pad_token_id = tokenizer.pad_token_id

# Pads each batch to its longest member. The rows are already padded to a fixed
# length by tokenizer_func, so this mainly converts the batch to tensors.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding="longest")

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()

# model.save_pretrained("distilbert-lora-judge")

Detected kernel version 4.15.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.2245,0.103462,0.974727
2,0.2068,0.125174,0.973818
3,0.1883,0.108706,0.974182
4,0.1893,0.114757,0.972545


