In [None]:
from google.colab import drive
import os
import sys

# Mount Google Drive and switch into the reward-training directory so that the
# relative config/data paths used by later cells resolve from here.
drive.mount('/content/drive')
os.chdir('/content/drive/MyDrive/ai agent trainning/train/reward')

# Put the parent of the working directory on the import path
# (presumably so sibling project packages can be imported — confirm).
current_dir = os.getcwd()
parent_dir = os.path.dirname(current_dir)
# Guarded so that re-running this cell does not keep appending duplicates.
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install -q bitsandbytes accelerate peft trl==0.15.2 datasets

In [None]:
import yaml

# Load the shared base settings first. `safe_load` returns None for an empty
# document, so fall back to an empty dict to keep `configs` a mapping.
with open('../configs/base.yaml', 'r', encoding='utf-8') as file:
    configs = yaml.safe_load(file) or {}

# Layer the reward-specific settings on top. `dict.update` is a shallow merge:
# a top-level key present in reward.yaml replaces the whole value from base.yaml.
with open('../configs/reward.yaml', 'r', encoding='utf-8') as file:
    configs.update(yaml.safe_load(file) or {})

In [None]:
import torch

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [None]:
from peft import AutoPeftModelForCausalLM, AutoPeftModelForSequenceClassification
from transformers import AutoTokenizer, BitsAndBytesConfig

# The bare `load_in_4bit` keyword is deprecated (the loader's own warning asks
# for a `BitsAndBytesConfig` passed as `quantization_config`). Only build a
# quantization config when 4-bit loading is requested, so the non-quantized
# path is left untouched.
quantization_config = (
    BitsAndBytesConfig(load_in_4bit=True)
    if configs['model']['load_in_4bit']
    else None
)

# backbone: PEFT causal-LM checkpoint named by `llm_backbone`, loaded in bfloat16
backbone = AutoPeftModelForCausalLM.from_pretrained(
    configs['llm_backbone'],
    quantization_config=quantization_config,
    torch_dtype=torch.bfloat16,
)
# Tokenizer is loaded from the same checkpoint identifier as the backbone.
tokenizer = AutoTokenizer.from_pretrained(configs['llm_backbone'])

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


# Load data

In [None]:
import json

# Read the reward dataset. The encoding is given explicitly so the result does
# not depend on the platform's default text encoding.
with open("../../data/reward/health-care-reward.json", "r", encoding="utf-8") as f:
    data = json.load(f)
# Display the number of loaded records as the cell output.
len(data)

3329

In [None]:
from datasets import Dataset
import random

# Shuffle a copy of the records with a generator seeded from the training
# config: the order is then reproducible across runs, and re-running this cell
# gives the same dataset because `data` itself is left untouched.
shuffle_rng = random.Random(configs['trainning']['seed'])
shuffled_data = list(data)
shuffle_rng.shuffle(shuffled_data)
dataset = Dataset.from_list(shuffled_data)

# Config model

In [None]:
from trl import RewardTrainer, RewardConfig
from peft import LoraConfig, TaskType

model_cfg = configs['model']
train_cfg = configs['trainning']

# LoRA adapter settings. Every keyword below is read from the `model` section
# of the config under the same name.
# Left disabled by the author: task_type=TaskType.SEQ_CLS and
# loftq_config=configs['model']['loftq_config'].
lora_keys = (
    'r',
    'lora_alpha',
    'lora_dropout',
    'target_modules',
    'bias',
    'use_rslora',
)
peft_config = LoraConfig(
    inference_mode=False,
    **{key: model_cfg[key] for key in lora_keys},
)

# Trainer settings. Every keyword below is read from the `trainning` section of
# the config under the same name. Training length is bounded by `max_steps`;
# `num_train_epochs` is left disabled by the author.
trainer_keys = (
    'per_device_train_batch_size',
    'gradient_accumulation_steps',
    'logging_steps',
    'max_steps',
    'learning_rate',
    'weight_decay',
    'report_to',
    'seed',
    'warmup_steps',
    'optim',
    'lr_scheduler_type',
    'output_dir',
)
training_args = RewardConfig(
    bf16=True,   # mixed precision in bfloat16 ...
    fp16=False,  # ... and explicitly not float16
    **{key: train_cfg[key] for key in trainer_keys},
)

# Train

In [None]:
from peft import get_peft_model

In [None]:
from torch import nn
from transformers.modeling_outputs import SequenceClassifierOutput
from transformers import BertForSequenceClassification

In [None]:
# ---------------------------------------------------------------------------
# Commented-out inline draft of LMBackbone / RewardHead / RewardModel.
# Nothing in this cell executes; the notebook imports `RewardModel` from the
# `reward` module in a later cell instead. Presumably this is an earlier copy
# of that module — confirm it is in sync (or obsolete) before relying on it.
# NOTE(review): in this draft, RewardModel.forward calls `reward_head` without
# the attention mask and then reads `hidden_states` / `attentions` off the
# head's return value rather than off the backbone output.
# ---------------------------------------------------------------------------
# class LMBackbone(nn.Module):
#     def __init__(self, model):
#         super(LMBackbone, self).__init__()
#         self.backbone = model.base_model.model.model
#         self.base_model_prefix = 'backbone'

#     def forward(self, input_ids, attention_mask=None, **kwargs):
#         x = self.backbone(input_ids, attention_mask=attention_mask, **kwargs)
#         x.hidden_states = torch.stack([x.last_hidden_state])
#         return x

# class RewardHead(nn.Module):
#     def __init__(self, hidden_size, dtype, train_ppo=False, dropout_rate=0.1):
#         super(RewardHead, self).__init__()
#         self.train_ppo = train_ppo

#         self.pooler = nn.Sequential(
#             nn.Linear(hidden_size, hidden_size, dtype=dtype),
#             nn.Dropout(dropout_rate)
#         )
#         self.classifier = nn.Sequential(
#             nn.Linear(hidden_size, 1, dtype=dtype),
#             nn.Dropout(dropout_rate)
#         )

#     def forward(self, last_hidden_state, attention_mask=None, **kwargs):
#         seq_len = last_hidden_state.size(1)
#         pooling = self.pooler(last_hidden_state)
#         # Apply the attention mask to exclude padding tokens
#         if attention_mask is not None:
#             attention_mask = attention_mask.unsqueeze(-1)
#             pooling = pooling * attention_mask
#             sum_hidden = torch.sum(pooling, dim=1)
#             token_count = torch.sum(attention_mask, dim=1)
#             # Avoid division by zero
#             token_count = torch.clamp(token_count, min=1e-9)

#             mean_pooled = sum_hidden / token_count
#         else:
#             mean_pooled = torch.mean(pooling, dim=1)

#         x = self.classifier(mean_pooled)
#         if self.train_ppo:
#             x = x.repeat(1, seq_len)
#         return x

# class RewardModel(torch.nn.Module):
#     def __init__(self, model, train_ppo=False):
#         super(RewardModel, self).__init__()

#         self.backbone = LMBackbone(model)
#         self.base_model_prefix = 'backbone'
#         self.hidden_size = model.config.hidden_size
#         self.torch_dtype = model.config.torch_dtype

#         self.reward_head = RewardHead(self.hidden_size, self.torch_dtype, train_ppo)

#         self.to(device)

#     def forward(self, input_ids, attention_mask=None, **kwargs):
#         x = self.backbone(input_ids, attention_mask=attention_mask, **kwargs)
#         outputs = self.reward_head(x.last_hidden_state)
#         return SequenceClassifierOutput(
#             loss=None,
#             logits=outputs,
#             hidden_states=outputs.hidden_states if hasattr(outputs, "hidden_states") else None,
#             attentions=outputs.attentions if hasattr(outputs, "attentions") else None,
#         )

#     def score(self, last_hidden_state):
#         reward_score = self.reward_head(last_hidden_state)
#         return reward_score

In [None]:
# RewardModel comes from the project's `reward` module (not shown here).
from reward import RewardModel

# Wrap the loaded PEFT backbone in the project's reward model.
reward_model = RewardModel(backbone)
# Share the backbone's `warnings_issued` object with the wrapper.
# NOTE(review): presumably needed because the trainer looks this attribute up
# on the model it is given and the wrapper does not define it — confirm.
reward_model.warnings_issued = backbone.warnings_issued

In [None]:
# Build the reward trainer from the wrapped model, the dataset, the tokenizer
# and the LoRA / training configurations defined in the earlier cells.
trainer = RewardTrainer(
    args=training_args,
    model=reward_model,
    processing_class=tokenizer,
    train_dataset=dataset,
    peft_config=peft_config,
)

# Make sure every reward-head parameter is trainable before training starts.
for head_param in trainer.model.reward_head.parameters():
    head_param.requires_grad_(True)

# The returned TrainOutput is displayed as the cell result.
trainer.train()



Map:   0%|          | 0/3329 [00:00<?, ? examples/s]

Map:   0%|          | 0/3329 [00:00<?, ? examples/s]

Filter:   0%|          | 0/3329 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModel`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
You're using a PreTrainedTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
1,2.9203
2,3.1093
3,2.9938
4,3.1503
5,2.3519
6,5.7453
7,2.8042
8,2.4246
9,3.2782
10,1.8292


TrainOutput(global_step=60, training_loss=2.703054309884707, metrics={'train_runtime': 1190.8609, 'train_samples_per_second': 0.403, 'train_steps_per_second': 0.05, 'total_flos': 0.0, 'train_loss': 2.703054309884707, 'epoch': 0.14414414414414414})

In [None]:
# Everything is written under the directory named by `pretrain_model`.
save_dir = configs['pretrain_model']

# Tokenizer first, then the trained model via its own `save_pretrained`.
tokenizer.save_pretrained(save_dir)
trainer.model.save_pretrained(save_dir)

# The reward head's weights are additionally stored on their own as a state dict.
reward_head_state = trainer.model.reward_head.state_dict()
torch.save(reward_head_state, f"{save_dir}/reward_head.pt")