In [2]:
import os, sys
import torch
from torch.utils.data import DataLoader, TensorDataset
import datasets
import optuna
from evaluate_metric import accuracy, f1
from datasets import load_dataset, load_metric
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer,
    BitsAndBytesConfig,
    DistilBertForSequenceClassification
)
from peft import PeftModel, LoraConfig, prepare_model_for_kbit_training, get_peft_model, TaskType, AutoPeftModelForSequenceClassification, PeftConfig, PeftMixedModel, PromptEncoderConfig, AdaLoraConfig
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm
2024-03-14 02:25:35.792451: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-03-14 02:25:37.566392: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-14 02:25:37.566433: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-14 02:25:37.567371: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-03-14 02:25:37.5

In [1]:
!pip install optuna

Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple, https://pypi.ngc.nvidia.com
Collecting optuna
  Downloading https://pypi.tuna.tsinghua.edu.cn/packages/4c/6a/219a431aaf81b3eb3070fd2d58116baa366d3072f43bbcc87dc3495b7546/optuna-3.5.0-py3-none-any.whl (413 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m413.4/413.4 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting alembic>=1.5.0 (from optuna)
  Downloading https://pypi.tuna.tsinghua.edu.cn/packages/7f/50/9fb3a5c80df6eb6516693270621676980acd6d5a9a7efdbfa273f8d616c7/alembic-1.13.1-py3-none-any.whl (233 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.4/233.4 kB[0m [31m59.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting colorlog (from optuna)
  Downloading https://pypi.tuna.tsinghua.edu.cn/packages/f3/18/3e867ab37a24fdf073c1617b9c7830e06ec270b1ea4694a624038fc40a03/colorlog-6.8.2-py3-none-any.whl (11 kB)
Collecting sqlalchemy>=1.3.0 (from optuna)
  Downloadi

In [3]:
# Directory holding the base DistilBERT checkpoint and its tokenizer files.
# Earlier experiments pointed at saved adapters instead
# ("./distilbert-lora-judge/", "./best_version/checkpoint-11500").
model_path = "./distilbert/"

# Only the tokenizer is created in this cell; no model is instantiated here.
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_path)

In [40]:
def my_model_init(trial=None):
    """Build a fresh DistilBERT sequence classifier wrapped with an AdaLoRA adapter.

    Args:
        trial: Optional trial-like object exposing ``suggest_categorical`` and
            ``suggest_float``. When given, the adapter rank, ``lora_alpha`` and
            ``lora_dropout`` are sampled from it; when ``None`` the fixed
            defaults (r=8, lora_alpha=32, lora_dropout=0.01) are used.

    Returns:
        The object returned by ``get_peft_model`` for the loaded base model.
    """
    # The base model is loaded first, exactly as before; ignore_mismatched_sizes
    # lets the 2-label head replace a checkpoint head of a different shape.
    base_model = AutoModelForSequenceClassification.from_pretrained(
        model_path, num_labels=2, ignore_mismatched_sizes=True
    )

    # Defaults used outside of a hyperparameter search.
    rank, alpha, dropout = 8, 32, 0.01
    if trial is not None:
        # Suggestion order (r, lora_alpha, lora_dropout) is kept unchanged.
        rank = trial.suggest_categorical("r", [4, 8, 16])
        alpha = trial.suggest_categorical("lora_alpha", [16, 32, 64])
        dropout = trial.suggest_float("lora_dropout", 0.0, 0.1)

    adapter_config = AdaLoraConfig(
        peft_type="ADALORA",
        task_type="SEQ_CLS",
        r=rank,
        lora_alpha=alpha,
        target_modules=["q_lin", "v_lin"],
        lora_dropout=dropout,
    )
    return get_peft_model(base_model, adapter_config)

In [33]:
def my_model_init(trial=None):
    """Build a fresh DistilBERT sequence classifier wrapped with an AdaLoRA adapter.

    This definition appears later in the notebook than the other
    ``my_model_init``, so after a top-to-bottom run it is the one in effect.
    It therefore accepts the optional ``trial`` argument as well; without it
    the hyperparameter search could not sample any adapter settings.

    Args:
        trial: Optional trial-like object exposing ``suggest_categorical`` and
            ``suggest_float``. When ``None`` the fixed defaults
            (r=8, lora_alpha=32, lora_dropout=0.005) are used.

    Returns:
        The object returned by ``get_peft_model`` for the loaded base model.
    """
    model = AutoModelForSequenceClassification.from_pretrained(
        model_path, num_labels=2, ignore_mismatched_sizes=True
    )

    lora_params = {"r": 8, "lora_alpha": 32, "lora_dropout": 0.005}
    if trial is not None:
        # Same names and ranges as the trial-aware definition earlier in the notebook.
        lora_params = {
            "r": trial.suggest_categorical("r", [4, 8, 16]),
            "lora_alpha": trial.suggest_categorical("lora_alpha", [16, 32, 64]),
            "lora_dropout": trial.suggest_float("lora_dropout", 0.0, 0.1),
        }

    ada_config = AdaLoraConfig(
        peft_type="ADALORA",
        task_type="SEQ_CLS",
        target_modules=["q_lin", "v_lin"],
        **lora_params,
    )
    return get_peft_model(model, ada_config)

In [5]:
# Bohrium dataset: Finetune-dataset-LLMKG
data_dir = "./dataset"

# Each split is read from "<split>_data.jsonl" inside data_dir.
dataset = load_dataset(
    "json",
    data_files={
        split: os.path.join(data_dir, f"{split}_data.jsonl")
        for split in ("train", "valid")
    },
)

Generating train split: 94000 examples [00:00, 417748.74 examples/s]
Generating valid split: 11000 examples [00:00, 90265.38 examples/s]


In [6]:
def tokenizer_func(example):
    """Tokenize a batch of (category_description, text) pairs and attach integer labels.

    Args:
        example: Mapping of column name to a list of values; the
            ``category_description``, ``text`` and ``label`` columns are read.

    Returns:
        The tokenizer output with an extra ``label`` entry holding the labels
        converted with ``int``.
    """
    encoded = tokenizer(
        example["category_description"],
        example["text"],
        padding="max_length",
        truncation=True,
        max_length=492,
    )
    # Return the converted labels explicitly instead of writing them back into
    # the input batch: the result then does not depend on the caller
    # propagating in-place edits, and the input is left untouched.
    encoded["label"] = [int(item) for item in example["label"]]
    # NOTE(review): padding="max_length" pads every row to 492 tokens up front,
    # which makes a later "longest" padding collator redundant — consider
    # dropping it if dynamic padding is intended.
    return encoded

In [7]:
# Shuffle with a fixed seed so the row order — and any subset sliced from the
# head of these datasets — is the same on every run, then tokenize in batches.
train_dataset = dataset["train"].shuffle(seed=42).map(tokenizer_func, batched=True)
valid_dataset = dataset["valid"].shuffle(seed=42).map(tokenizer_func, batched=True)

Map: 100%|██████████| 94000/94000 [00:16<00:00, 5546.54 examples/s]
Map: 100%|██████████| 11000/11000 [00:02<00:00, 5371.85 examples/s]


In [86]:
# Small head slices (500 training rows, 100 validation rows) for quick experiments.
train_dataset_small, valid_dataset_small = (
    split.select(range(size))
    for split, size in ((train_dataset, 500), (valid_dataset, 100))
)

In [82]:
# Metric objects are created once and reused by every evaluation call.
acc_metric = accuracy.Accuracy()
f1_metric = f1.F1()


def eval_metric(eval_predict):
    """Compute accuracy and F1 for one evaluation pass.

    Args:
        eval_predict: A pair that unpacks into ``(predictions, labels)``.
            ``predictions`` is reduced with ``argmax`` over its last axis
            (presumably per-class scores — confirm against the caller).

    Returns:
        The dict produced by the accuracy metric, updated with the entries
        produced by the F1 metric. The dict is also printed.
    """
    scores, references = eval_predict
    predicted_classes = scores.argmax(axis=-1)

    results = acc_metric.compute(predictions=predicted_classes, references=references)
    f1_results = f1_metric.compute(predictions=predicted_classes, references=references)
    results.update(f1_results)

    print(results)
    return results

In [95]:
# Build the arguments through the constructor so TrainingArguments validates
# and normalises them. Its first positional parameter is output_dir — it does
# not load a saved training_args.bin — and attributes assigned after
# construction skip that validation. In particular, load_best_model_at_end only
# takes effect when the constructor fills in the default metric_for_best_model
# ("loss"); it requires the save schedule to line up with the evaluation
# schedule, which the default 500-step saving does here.
# NOTE(review): newer transformers releases rename evaluation_strategy to
# eval_strategy; keep the name that matches the installed version.
training_args = TrainingArguments(
    output_dir="./outputs0314/",
    run_name="./outputs0314/experiment_1",
    logging_dir="./outputs0314/",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=1,
    evaluation_strategy="steps",
    eval_steps=500,
    logging_steps=500,
    load_best_model_at_end=True,
)

In [97]:
# Collator configured to pad each batch to its longest member.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding="longest")

# A model_init callable is passed instead of a model instance, so the Trainer
# builds the model itself through my_model_init.
trainer = Trainer(
    model_init=my_model_init,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    compute_metrics=eval_metric,
)

Detected kernel version 4.15.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at ./distilbert/ and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([3, 768]) in the checkpoint and torch.Size([2, 768]) in the model instantiated
- classifier.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([2]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [99]:
def default_hp_space_optuna(trial):
    """Sample one set of training hyperparameters from ``trial``.

    Args:
        trial: Trial-like object exposing ``suggest_float``, ``suggest_int``
            and ``suggest_categorical``.

    Returns:
        Dict with the sampled ``learning_rate``, ``num_train_epochs``,
        ``seed``, ``weight_decay``, ``per_device_train_batch_size`` and
        ``optim`` values, in that order.
    """
    space = {}
    # Learning rate and weight decay are sampled on a log scale.
    space["learning_rate"] = trial.suggest_float("learning_rate", 1e-6, 1e-4, log=True)
    space["num_train_epochs"] = trial.suggest_int("num_train_epochs", 1, 5)
    space["seed"] = trial.suggest_int("seed", 1, 40)
    space["weight_decay"] = trial.suggest_float("weight_decay", 1e-4, 1e-2, log=True)
    space["per_device_train_batch_size"] = trial.suggest_categorical(
        "per_device_train_batch_size", [4, 8, 16, 32, 64]
    )
    space["optim"] = trial.suggest_categorical("optim", ["sgd", "adamw_hf"])
    return space

# Run 10 Optuna trials and keep the run with the highest "eval_f1".
# Despite the plural name, the result printed later in the notebook is a
# single BestRun.
best_trials = trainer.hyperparameter_search(
    hp_space=default_hp_space_optuna,
    compute_objective=lambda metrics: metrics["eval_f1"],
    direction="maximize",
    backend="optuna",
    n_trials=10,
)

[I 2024-03-14 06:56:42,271] A new study created in memory with name: no-name-ef73daaa-6e44-4f29-9516-6174fe4f004b
Detected kernel version 4.15.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at ./distilbert/ and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([3, 768]) in the checkpoint and torch.Size([2, 768]) in the model instantiated
- classifier.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([2]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss,Accuracy,F1
10,1.8788,1.879077,0.75,0


[W 2024-03-14 06:56:49,369] Trial 0 failed with parameters: {'learning_rate': 2.5861386303120133e-06, 'num_train_epochs': 5, 'seed': 5, 'weight_decay': 0.00015385565831258573, 'per_device_train_batch_size': 64, 'optim': 'sgd', 'r': 4, 'lora_alpha': 16, 'lora_dropout': 0.06996632755540107} because of the following error: KeyError('f1').
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/optuna/study/_optimize.py", line 200, in _run_trial
    value_or_values = func(trial)
  File "/usr/local/lib/python3.10/dist-packages/transformers/integrations/integration_utils.py", line 199, in _objective
    trainer.train(resume_from_checkpoint=checkpoint, trial=trial)
  File "/usr/local/lib/python3.10/dist-packages/transformers/trainer.py", line 1624, in train
    return inner_training_loop(
  File "/usr/local/lib/python3.10/dist-packages/transformers/trainer.py", line 2029, in _inner_training_loop
    self._maybe_log_save_evaluate(tr_loss, grad_norm, model, trial, epo

KeyError: 'f1'

In [93]:
# Show the BestRun from the search: run id, objective value and hyperparameters.
print(best_trials)

BestRun(run_id='2', objective=0.08450704225352113, hyperparameters={'learning_rate': 3.5438361164518976e-06, 'num_train_epochs': 1, 'seed': 31, 'weight_decay': 0.0004407602115615133, 'per_device_train_batch_size': 4, 'optim': 'sgd', 'r': 8, 'lora_alpha': 64, 'lora_dropout': 0.06968277594275944}, run_summary=None)


In [None]:
# Retrain the model with the best hyperparameters found by the search.
best_hparams = dict(best_trials.hyperparameters)

# The best run mixes training arguments (learning_rate, optim, ...) with the
# adapter settings sampled inside my_model_init (r, lora_alpha, lora_dropout).
# Only names that already exist on the training arguments are assigned there;
# everything else belongs to the model.
args_hparams = {n: v for n, v in best_hparams.items() if hasattr(trainer.args, n)}
model_hparams = {n: v for n, v in best_hparams.items() if n not in args_hparams}

for n, v in args_hparams.items():
    setattr(trainer.args, n, v)

if model_hparams:
    # trainer.train() calls model_init without a trial, which would fall back
    # to the default adapter settings. A FixedTrial replays the stored values
    # through the same suggest_* calls, so the retrained model uses the best
    # adapter configuration as well.
    trainer.model_init = lambda trial=None: my_model_init(
        optuna.trial.FixedTrial(model_hparams)
    )

trainer.train()