In [1]:
!pip install accelerate datasets transformers peft

Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple


In [15]:
import os, torch
from datasets import load_dataset
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from trl import (
    ModelConfig,
    RewardConfig,
    RewardTrainer,
    ScriptArguments,
    get_kbit_device_map,
    get_peft_config,
    get_quantization_config,
    setup_chat_format,
)

# 🚫 禁用 trackio / wandb 等日志
os.environ["DISABLE_TRACKIO"] = "1"
os.environ["WANDB_DISABLED"] = "true"


In [3]:
import trl
import transformers
import torch
print("trl version: ", trl.__version__)
print("transformers version: ", transformers.__version__)
print("torch veion: ", torch.__version__)

trl version:  0.22.2
transformers version:  4.56.0
torch veion:  2.3.1+cu118


In [22]:
model_path = "C:/Users/hhm18/Desktop/深度学习/env_DRL/model/QwenQwen2.5-0.5B-Instruct"
save_path = "C:/Users/hhm18/Desktop/深度学习/env_DRL/model/trainedppo"
reward_model_path = "C:/Users/hhm18/Desktop/深度学习/env_DRL/model/reward"

training_args = RewardConfig(
    output_dir=reward_model_path,
    per_device_train_batch_size=4,
    num_train_epochs=1,
    learning_rate=1e-5,
    gradient_checkpointing=True,
    eval_strategy="steps",
    eval_steps=50,
    max_length=1024,
    logging_strategy="no",             # 🚫 关闭日志
    # eval_strategy="no",                # 🚫 不做 eval（想做就改成 "steps"）
    save_strategy="no",                # 🚫 不自动保存 checkpoint
    report_to="none"                   # 🚫 禁用所有集成
)

model_args = ModelConfig(
    model_name_or_path=model_path,   # 基座模型
    trust_remote_code=True,
    use_peft=False  # 如果要用LoRA再改True
)


In [5]:
# 加载模型和tokenizer
torch_dtype = (
    model_args.torch_dtype if model_args.torch_dtype in ["auto", None] else getattr(torch, model_args.torch_dtype)
)

quantization_config = get_quantization_config(model_args)

model_kwargs = dict(
    revision=model_args.model_revision,
    device_map=get_kbit_device_map() if quantization_config is not None else None,
    quantization_config=quantization_config,
    use_cache=False if training_args.gradient_checkpointing else True,
    torch_dtype=torch_dtype,
)

model = AutoModelForSequenceClassification.from_pretrained(
    model_args.model_name_or_path,
    num_labels=1,
    trust_remote_code=model_args.trust_remote_code,
    **model_kwargs
)

tokenizer = AutoTokenizer.from_pretrained(
    model_args.model_name_or_path, trust_remote_code=model_args.trust_remote_code, use_fast=True
)


Some weights of Qwen2ForSequenceClassification were not initialized from the model checkpoint at C:/Users/hhm18/Desktop/深度学习/env_DRL/model/QwenQwen2.5-0.5B-Instruct and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
# 对齐 pad_token
tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = tokenizer.pad_token_id

# 如果是 base 模型，给它加上 chat 模板
if tokenizer.chat_template is None:
    model, tokenizer = setup_chat_format(model, tokenizer)

In [7]:
tokenizer.pad_token, model.config.pad_token_id

('<|im_end|>', 151645)

In [8]:
model.score.weight.shape

torch.Size([1, 896])

In [9]:
from datasets import load_dataset

data_path = "C:/Users/hhm18/Desktop/深度学习/env_DRL/data"
dataset = load_dataset(data_path,
                       data_files={
                            "train": "train.jsonl.gz",
                            "test": "test.jsonl.gz"
                       })
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['chosen', 'rejected'],
        num_rows: 52421
    })
    test: Dataset({
        features: ['chosen', 'rejected'],
        num_rows: 2749
    })
})


In [10]:
train_dataset = dataset["train"].select(range(100))
eval_dataset = dataset["test"].select(range(50))
train_dataset, eval_dataset

(Dataset({
     features: ['chosen', 'rejected'],
     num_rows: 100
 }),
 Dataset({
     features: ['chosen', 'rejected'],
     num_rows: 50
 }))

In [20]:
trainer = RewardTrainer(
    model=model,
    processing_class=tokenizer,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    peft_config=get_peft_config(model_args),
)


In [21]:
trainer.train()

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None}.
You're using a Qwen2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
  attn_output = torch.nn.functional.scaled_dot_product_attention(


Step,Training Loss,Validation Loss


TrainOutput(global_step=25, training_loss=0.9132211303710938, metrics={'train_runtime': 318.3071, 'train_samples_per_second': 0.314, 'train_steps_per_second': 0.079, 'total_flos': 0.0, 'train_loss': 0.9132211303710938, 'epoch': 1.0})

In [None]:
trainer.save_model(training_args.output_dir)