### Dataset & Model Preparation

In [None]:
# sft model download
# execute the following commands in shell or you will meet network problems
!export HF_ENDPOINT=https://hf-mirror.com
!huggingface-cli download --resume-download facebook/opt-350m --local-dir ./model/sft_model/

In [None]:
# dataset download
# Fetches the Dahoas/rm-static repository into ./data/rm_static.
# `--repo-type dataset` tells the CLI this is a dataset repo rather than a model repo.
!huggingface-cli download --repo-type dataset --resume-download Dahoas/rm-static --local-dir ./data/rm_static

In [None]:
# rm model download
# Fetches facebook/opt-125m into ./model/rm_model/
# (presumably the base checkpoint for the reward model used in a later step — confirm).
!huggingface-cli download --resume-download facebook/opt-125m --local-dir ./model/rm_model/

### Step 1: SFT training
Adapted from https://github.com/deepspeedai/DeepSpeedExamples/blob/master/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/main.py

In [1]:
# initialize the device
# Builds a torch.device from the device name reported by DeepSpeed's accelerator
# object; in the recorded run below this resolved to 'cuda'.
import torch
from deepspeed import get_accelerator
device = torch.device(get_accelerator().device_name())
device  # bare last expression so the notebook displays the resolved device

[2025-03-11 18:36:13,565] [INFO] [real_accelerator.py:222:get_accelerator] Setting ds_accelerator to cuda (auto detect)


/root/miniconda3/envs/instructGPT/compiler_compat/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status
/root/miniconda3/envs/instructGPT/compiler_compat/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status
  from .autonotebook import tqdm as notebook_tqdm


device(type='cuda')

In [None]:
# initialize deepspeed config
# Brings up the distributed backend first (rank and world size are only available
# afterwards), then builds the step-1 DeepSpeed config and fills in the batch sizes.
from src.utils import get_train_ds_config
import deepspeed
deepspeed.init_distributed()
global_rank = torch.distributed.get_rank()
ds_config = get_train_ds_config(
    offload=False,
    dtype='fp16',
    stage=0,
    enable_tensorboard=True,
    tb_path="step1_tensorboard",
    tb_name="step1_model",
)
per_device_train_batch_size = 4
per_device_eval_batch_size = 4
gradient_accumulation_steps = 1
# Global batch = per-device micro batch * number of ranks * accumulation steps.
ds_config.update({
    'train_micro_batch_size_per_gpu': per_device_train_batch_size,
    'train_batch_size': (per_device_train_batch_size
                         * torch.distributed.get_world_size()
                         * gradient_accumulation_steps),
})
# Wait until every rank has finished its setup before continuing.
torch.distributed.barrier()

[2025-03-11 18:36:20,912] [INFO] [comm.py:658:init_distributed] cdb=None
[2025-03-11 18:36:20,914] [INFO] [comm.py:673:init_distributed] Not using the DeepSpeed or dist launchers, attempting to detect MPI environment...
[2025-03-11 18:36:21,053] [INFO] [comm.py:728:mpi_discovery] Discovered MPI settings of world_rank=0, local_rank=0, world_size=1, master_addr=172.17.0.9, master_port=29500
[2025-03-11 18:36:21,055] [INFO] [comm.py:689:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl


[rank0]:[W311 18:36:21.654211060 ProcessGroupNCCL.cpp:4561] [PG ID 0 PG GUID 0 Rank 0]  using GPU 0 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. Specify device_ids in barrier() to force use of a particular device, or call init_process_group() with a device_id.


In [3]:
# initialize tokenizer
# Loads the tokenizer that ships with the local SFT checkpoint and configures padding.
import os
import json
from transformers import AutoTokenizer
model_name_or_path = "./model/sft_model"
model_json = os.path.join(model_name_or_path, "config.json")
# Fail fast with an actionable message: without this guard a missing checkpoint
# would only surface as a NameError on `tokenizer` further down.
if not os.path.exists(model_json):
    raise FileNotFoundError(
        f"{model_json} not found; download the SFT model into {model_name_or_path} first")
# The context manager closes the config file once it has been parsed.
with open(model_json) as config_file:
    model_json_file = json.load(config_file)
# Name recorded in the checkpoint config, falling back to the local path.
model_name = model_json_file.get("_name_or_path", model_name_or_path)
# `use_fast` is the keyword AutoTokenizer.from_pretrained defines for selecting the
# fast (Rust-backed) tokenizer implementation.
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=True)
# Reuse the EOS token for padding and pad on the right-hand side.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = 'right'
tokenizer

GPT2TokenizerFast(name_or_path='./model/sft_model', vocab_size=50265, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '</s>', 'eos_token': '</s>', 'unk_token': '</s>', 'pad_token': '</s>'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
	1: AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
}
)

In [4]:
# initialize sft model
# Loads the causal LM from the local checkpoint, aligns its special-token ids with
# the tokenizer, and resizes the embedding table.
from transformers import AutoConfig, AutoModelForCausalLM
import math
model_name_or_path = "./model/sft_model"
model_config = AutoConfig.from_pretrained(model_name_or_path)
model = AutoModelForCausalLM.from_pretrained(
    model_name_or_path,
    config=model_config,
    from_tf=".ckpt" in model_name_or_path,
)
# Record the tokenizer's EOS id on the config; padding uses the model's own EOS id.
model.config.end_token_id = tokenizer.eos_token_id
model.config.pad_token_id = model.config.eos_token_id
# Round the vocabulary size up to the next multiple of 8 before resizing.
model.resize_token_embeddings(int(math.ceil(len(tokenizer) / 8.0)) * 8)
model


OPTForCausalLM(
  (model): OPTModel(
    (decoder): OPTDecoder(
      (embed_tokens): Embedding(50272, 512, padding_idx=1)
      (embed_positions): OPTLearnedPositionalEmbedding(2050, 1024)
      (project_out): Linear(in_features=1024, out_features=512, bias=False)
      (project_in): Linear(in_features=512, out_features=1024, bias=False)
      (layers): ModuleList(
        (0-23): 24 x OPTDecoderLayer(
          (self_attn): OPTSdpaAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (activation_fn): ReLU()
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (fc2): Linear(in_features

In [5]:
# convert model to lora form for efficient sft
from src.lora import convert_linear_layer_to_lora
# Name fragment handed to the converter. In the recorded output below, the Linear
# layers under decoder.layers (k/v/q/out_proj, fc1) show up as LinearLayer_LoRA,
# while project_in / project_out stay plain Linear.
lora_module_name = "decoder.layers."
# NOTE(review): presumably the LoRA rank — confirm against src.lora.
lora_dim = 128
model = convert_linear_layer_to_lora(model, lora_module_name, lora_dim)
model

OPTForCausalLM(
  (model): OPTModel(
    (decoder): OPTDecoder(
      (embed_tokens): Embedding(50272, 512, padding_idx=1)
      (embed_positions): OPTLearnedPositionalEmbedding(2050, 1024)
      (project_out): Linear(in_features=1024, out_features=512, bias=False)
      (project_in): Linear(in_features=512, out_features=1024, bias=False)
      (layers): ModuleList(
        (0-23): 24 x OPTDecoderLayer(
          (self_attn): OPTSdpaAttention(
            (k_proj): LinearLayer_LoRA(
              (lora_dropout): Identity()
            )
            (v_proj): LinearLayer_LoRA(
              (lora_dropout): Identity()
            )
            (q_proj): LinearLayer_LoRA(
              (lora_dropout): Identity()
            )
            (out_proj): LinearLayer_LoRA(
              (lora_dropout): Identity()
            )
          )
          (activation_fn): ReLU()
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (fc1): LinearLayer_LoRA(


In [6]:
 # Prepare the data
# Builds the train/eval datasets for this step from the locally downloaded data.
from src.data_utils import create_prompt_dataset
train_phase = 1  # echoed as train_phase=1 in the log output below
local_rank = -1  # NOTE(review): presumably means "no distributed launcher" — confirm against src.data_utils
data_path = ["./data/rm_static"]
data_split = "2,4,4"  # NOTE(review): presumably the relative share of data per training phase — confirm
data_output_path = "./data/rm_static_processed4sft/"
max_seq_len = 512
train_dataset, eval_dataset = create_prompt_dataset(
    local_rank,
    data_path,
    data_split,
    data_output_path,
    train_phase,
    1234,  # NOTE(review): looks like the random seed argument — confirm against src.data_utils
    tokenizer,
    max_seq_len,
    end_of_conversation_token=tokenizer.eos_token)
train_dataset

Creating prompt dataset ['./data/rm_static'], reload=False
Creating dataset Dahoas_rm_static for train_phase=1 size=15252
Creating dataset Dahoas_rm_static for train_phase=1 size=1021


<src.data_utils.PromptDataset at 0x7f03d88fc3d0>

In [None]:
# create dataloader
# Wraps the prepared datasets in DataLoaders using the per-device batch sizes.
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from transformers import default_data_collator
# Training examples are drawn in random order; evaluation walks the set sequentially.
train_sampler = RandomSampler(train_dataset)
eval_sampler = SequentialSampler(eval_dataset)
train_dataloader = DataLoader(
    train_dataset,
    batch_size=per_device_train_batch_size,
    sampler=train_sampler,
    collate_fn=default_data_collator,
)
eval_dataloader = DataLoader(
    eval_dataset,
    batch_size=per_device_eval_batch_size,
    sampler=eval_sampler,
    collate_fn=default_data_collator,
)