In [1]:
import os, sys
import torch
import datasets
import evaluate
from datasets import load_dataset, load_metric
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    BitsAndBytesConfig,
    DataCollatorForLanguageModeling,
    DataCollatorForSeq2Seq,
    DataCollatorWithPadding,
    default_data_collator,
    Trainer,
    TrainingArguments,
    GenerationConfig
)
from accelerate import notebook_launcher
from peft import PeftModel, LoraConfig, prepare_model_for_kbit_training, get_peft_model, TaskType
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm
2024-03-12 10:35:20.119511: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-03-12 10:35:22.044923: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-12 10:35:22.044966: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-12 10:35:22.046127: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-03-12 10:35:22.0

In [2]:
# Directory that holds the local DistilBERT checkpoint and tokenizer files.
model_path = "./distilbert/"
tokenizer = AutoTokenizer.from_pretrained(model_path)

# Optional 4-bit quantization, currently disabled. To turn it on, create the
# config below and pass quantization_config=bnb_config (and device_map="auto")
# to from_pretrained:
#
# bnb_config = BitsAndBytesConfig(
#     load_in_4bit=True,
#     bnb_4bit_use_double_quant=True,
#     bnb_4bit_quant_type="nf4",
#     bnb_4bit_compute_dtype=torch.bfloat16
# )

# Request a 2-label classification head. ignore_mismatched_sizes=True lets
# loading continue when the checkpoint's classifier shape differs; the head is
# then newly initialised and has to be trained (see the warning printed below).
model = AutoModelForSequenceClassification.from_pretrained(
    model_path, num_labels=2, ignore_mismatched_sizes=True
)

  return self.fget.__get__(instance, owner)()
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at ./distilbert/ and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([3, 768]) in the checkpoint and torch.Size([2, 768]) in the model instantiated
- classifier.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([2]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [3]:
# Bohrium dataset: Finetune-dataset-LLMKG
data_dir = "./dataset"
# Load the JSON Lines files as the "train" and "valid" splits.
dataset = load_dataset(
    "json",
    data_files={
        split: os.path.join(data_dir, f"{split}_data.jsonl")
        for split in ("train", "valid")
    },
)

In [6]:
# Prompt fed to the classifier: an introduction followed by a guideline section
# ({instruction}) and a description section ({input}). The pieces below are
# joined at compile time; the four leading spaces in the second and third
# pieces are part of the prompt text and are kept so the string stays
# byte-identical.
prompt_template = (
    "以下是任务的指引和具体描述。"
    "    判断描述是否满足任务的指引."
    "    \n\n### 指引:\n{instruction}\n\n### 描述:\n{input}\n"
)


def generate_prompt(instruction, inputs, prompt_template=prompt_template):
    """Fill the prompt template with a guideline and a description.

    Args:
        instruction: Text substituted for the ``{instruction}`` placeholder.
        inputs: Text substituted for the ``{input}`` placeholder.
        prompt_template: Format string containing both placeholders; defaults
            to the module-level ``prompt_template``.

    Returns:
        The formatted prompt string.
    """
    return prompt_template.format(instruction=instruction, input=inputs)
def tokenizer_func(example):
    """Tokenize one dataset record as a classification prompt.

    Converts every item of ``example["label"]`` to ``int`` and writes the
    resulting list back into ``example``, then builds the prompt from the
    ``category_description`` and ``text`` fields and tokenizes it.

    Args:
        example: Mapping with ``label``, ``category_description`` and ``text``
            entries. ``label`` must be iterable (presumably a string or a
            list -- TODO confirm against the JSONL schema).

    Returns:
        The tokenizer output for the prompt, padded or truncated to 512 tokens.
    """
    example["label"] = list(map(int, example["label"]))
    prompt_text = generate_prompt(
        instruction=example["category_description"],
        inputs=example["text"],
    )
    # Fixed-length encoding: every prompt is padded or cut to max_length.
    return tokenizer(
        prompt_text, padding="max_length", truncation=True, max_length=512
    )
    
# Shuffle and tokenize both splits. A fixed seed keeps the row order identical
# across kernel restarts, so Restart & Run All reproduces the same data order
# (an unseeded shuffle yields a different order on every run).
SHUFFLE_SEED = 42
train_dataset = dataset["train"].shuffle(seed=SHUFFLE_SEED).map(tokenizer_func)
valid_dataset = dataset["valid"].shuffle(seed=SHUFFLE_SEED).map(tokenizer_func)

Map: 100%|██████████| 94000/94000 [01:11<00:00, 1308.14 examples/s]
Map: 100%|██████████| 11000/11000 [00:08<00:00, 1304.00 examples/s]


In [11]:
# Show the tokenized training split's summary (column names and row count).
print(train_dataset)

Dataset({
    features: ['category_description', 'text', 'label', 'input_ids', 'attention_mask'],
    num_rows: 94000
})


In [None]:
# Print the loaded model so its layers can be inspected.
print(model)

In [8]:
# LoRA adapter settings for the sequence-classification task.
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    # Apply the low-rank update to the attention query and value weight matrices.
    target_modules=["q_lin", "v_lin"],
    lora_dropout=0.05,
    bias="none",
    # Other TaskType options: CAUSAL_LM, FEATURE_EXTRACTION, QUESTION_ANS,
    # SEQ_2_SEQ_LM and TOKEN_CLS.
    task_type=TaskType.SEQ_CLS,
)

# prepare_model_for_kbit_training is an extra step meant for models loaded
# with 4-bit/8-bit quantization. Run it only in that case, so a full-precision
# model goes straight to get_peft_model.
if getattr(model, "is_loaded_in_4bit", False) or getattr(model, "is_loaded_in_8bit", False):
    model = prepare_model_for_kbit_training(model)
peft_model = get_peft_model(model, lora_config)

In [9]:
# The first positional parameter of TrainingArguments is output_dir; it does
# not load a saved training_args.bin. Pass the real output directory by keyword
# instead of the path to a .bin file, so default-valued arguments are created
# with a sensible output location.
training_args = TrainingArguments(output_dir="./outputs0310/")

In [None]:
# Print the current training arguments for inspection.
print(training_args)

In [15]:
# Output and logging locations, plus the run name.
training_args.output_dir = "./outputs0310/"
training_args.logging_dir = "./outputs0310/"
training_args.run_name = "./outputs0310/experiment_1"

# Per-device batch sizes and the number of training epochs.
training_args.per_device_train_batch_size = 64
training_args.per_device_eval_batch_size = 64
training_args.num_train_epochs = 8

In [16]:
# Take the padding id from the tokenizer. Copying config.eos_token_id (a
# pattern from decoder-only models) would set pad_token_id to None whenever the
# config has no EOS token, which is the usual case for BERT-style checkpoints.
model.config.pad_token_id = tokenizer.pad_token_id

# Collate examples into padded tensor batches.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding=True)
trainer = Trainer(
    model=peft_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
    # TODO: define compute_metrics and pass it here to get evaluation metrics.
    #compute_metrics=compute_metrics,
)
trainer.train()

# Save through the PEFT wrapper so the trained LoRA adapter and its config are
# written. Calling save_pretrained on the wrapped base model would dump the
# adapter-injected base weights instead of a reloadable adapter.
peft_model.save_pretrained("distilbert-lora-judge")

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Step,Training Loss
500,0.0634
1000,0.0546
1500,0.0516
2000,0.0465
2500,0.0391
3000,0.0414
3500,0.0354
4000,0.0323
4500,0.032
5000,0.0265


IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

