### Dataset & Model Preparation

In [None]:
# sft model download
# execute the following commands in shell or you will meet network problems
!export HF_ENDPOINT=https://hf-mirror.com
!huggingface-cli download --resume-download facebook/opt-350m --local-dir ./model/sft_model/

In [None]:
# dataset download
# Fetch the Dahoas/rm-static dataset repository into ./data/rm_static.
!huggingface-cli download --repo-type dataset --resume-download Dahoas/rm-static --local-dir ./data/rm_static

In [None]:
# rm model download
# Fetch facebook/opt-125m into ./model/rm_model/ ("rm" presumably stands for reward model — confirm).
!huggingface-cli download --resume-download facebook/opt-125m --local-dir ./model/rm_model/

### Step 1: SFT training
Adapted from https://github.com/deepspeedai/DeepSpeedExamples/blob/master/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/main.py

In [1]:
# initialize the device
import torch
from deepspeed import get_accelerator

# Ask DeepSpeed for the active accelerator's device name and wrap it in a torch.device.
accelerator_name = get_accelerator().device_name()
device = torch.device(accelerator_name)
device

[2025-03-12 00:34:15,404] [INFO] [real_accelerator.py:222:get_accelerator] Setting ds_accelerator to cuda (auto detect)


/root/miniconda3/envs/instructGPT/compiler_compat/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status
/root/miniconda3/envs/instructGPT/compiler_compat/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status
  from .autonotebook import tqdm as notebook_tqdm


device(type='cuda')

In [2]:
# initialize deepspeed config
from src.ds_utils import get_train_ds_config
import deepspeed

# Batch-size settings shared by the DeepSpeed config below and the dataloaders later on.
per_device_train_batch_size = 24
per_device_eval_batch_size = 24
gradient_accumulation_steps = 1

# The distributed backend has to be up before rank / world size can be queried.
deepspeed.init_distributed()
global_rank = torch.distributed.get_rank()

ds_config = get_train_ds_config(
    offload=False,
    dtype='fp16',
    stage=0,
    enable_tensorboard=True,
    tb_path="step1_tensorboard",
    tb_name="step1_model",
)
ds_config['train_micro_batch_size_per_gpu'] = per_device_train_batch_size
# Global batch = micro batch x number of ranks x gradient accumulation steps.
world_size = torch.distributed.get_world_size()
ds_config['train_batch_size'] = (per_device_train_batch_size
                                 * world_size
                                 * gradient_accumulation_steps)

# Synchronize all ranks before moving on.
torch.distributed.barrier()

[2025-03-12 00:34:43,237] [INFO] [comm.py:658:init_distributed] cdb=None
[2025-03-12 00:34:43,237] [INFO] [comm.py:673:init_distributed] Not using the DeepSpeed or dist launchers, attempting to detect MPI environment...
[2025-03-12 00:34:43,309] [INFO] [comm.py:728:mpi_discovery] Discovered MPI settings of world_rank=0, local_rank=0, world_size=1, master_addr=172.17.0.7, master_port=29500
[2025-03-12 00:34:43,310] [INFO] [comm.py:689:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl


[rank0]:[W312 00:34:43.657286789 ProcessGroupNCCL.cpp:4561] [PG ID 0 PG GUID 0 Rank 0]  using GPU 0 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. Specify device_ids in barrier() to force use of a particular device, or call init_process_group() with a device_id.


In [3]:
# initialize tokenizer
import os
import json
from transformers import AutoTokenizer
model_name_or_path = "./model/sft_model"
model_json = os.path.join(model_name_or_path, "config.json")
# Fail with a clear message instead of a NameError on `tokenizer` further down.
if not os.path.exists(model_json):
    raise FileNotFoundError(
        f"{model_json} not found; download the SFT base model into "
        f"{model_name_or_path} first")
# Use a context manager so the config file handle is closed right after parsing.
with open(model_json) as config_file:
    model_json_file = json.load(config_file)
model_name = model_json_file.get("_name_or_path", model_name_or_path)
# `use_fast` is the documented keyword for selecting the fast tokenizer implementation.
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=True)
# Pad with the EOS token, on the right-hand side of each sequence.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = 'right'
tokenizer

GPT2TokenizerFast(name_or_path='./model/sft_model', vocab_size=50265, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '</s>', 'eos_token': '</s>', 'unk_token': '</s>', 'pad_token': '</s>'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
	1: AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
}
)

In [4]:
# initialize sft model
from transformers import AutoConfig, AutoModelForCausalLM
import math

model_name_or_path = "./model/sft_model"
model_config = AutoConfig.from_pretrained(model_name_or_path)
# A ".ckpt" in the path switches from_pretrained to loading from a TensorFlow checkpoint.
is_tf_checkpoint = ".ckpt" in model_name_or_path
model = AutoModelForCausalLM.from_pretrained(model_name_or_path,
                                             from_tf=is_tf_checkpoint,
                                             config=model_config)

# End token comes from the tokenizer's EOS id; padding reuses the model config's EOS id.
model.config.end_token_id = tokenizer.eos_token_id
model.config.pad_token_id = model.config.eos_token_id

# Round the embedding table size up to the next multiple of 8.
padded_vocab_size = int(8 * math.ceil(len(tokenizer) / 8.0))
model.resize_token_embeddings(padded_vocab_size)
model


OPTForCausalLM(
  (model): OPTModel(
    (decoder): OPTDecoder(
      (embed_tokens): Embedding(50272, 512, padding_idx=1)
      (embed_positions): OPTLearnedPositionalEmbedding(2050, 1024)
      (project_out): Linear(in_features=1024, out_features=512, bias=False)
      (project_in): Linear(in_features=512, out_features=1024, bias=False)
      (layers): ModuleList(
        (0-23): 24 x OPTDecoderLayer(
          (self_attn): OPTSdpaAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (activation_fn): ReLU()
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (fc2): Linear(in_features

In [5]:
# convert model to lora form for efficient sft
from src.lora import convert_linear_layer_to_lora

# Settings handed to the converter: the LoRA dimension and the module-name fragment.
lora_dim = 128
lora_module_name = "decoder.layers."

model = convert_linear_layer_to_lora(model, lora_module_name, lora_dim)
model

OPTForCausalLM(
  (model): OPTModel(
    (decoder): OPTDecoder(
      (embed_tokens): Embedding(50272, 512, padding_idx=1)
      (embed_positions): OPTLearnedPositionalEmbedding(2050, 1024)
      (project_out): Linear(in_features=1024, out_features=512, bias=False)
      (project_in): Linear(in_features=512, out_features=1024, bias=False)
      (layers): ModuleList(
        (0-23): 24 x OPTDecoderLayer(
          (self_attn): OPTSdpaAttention(
            (k_proj): LinearLayer_LoRA(
              (lora_dropout): Identity()
            )
            (v_proj): LinearLayer_LoRA(
              (lora_dropout): Identity()
            )
            (q_proj): LinearLayer_LoRA(
              (lora_dropout): Identity()
            )
            (out_proj): LinearLayer_LoRA(
              (lora_dropout): Identity()
            )
          )
          (activation_fn): ReLU()
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (fc1): LinearLayer_LoRA(


In [6]:
 # Prepare the data
from src.data_utils import create_prompt_dataset

train_phase = 1
local_rank = -1
data_path = ["./data/rm_static"]
data_split = "2,4,4"
data_output_path = "./data/rm_static_processed4sft/"
max_seq_len = 512
# Positional argument between train_phase and tokenizer; presumably the
# shuffling/splitting seed — confirm against create_prompt_dataset.
data_seed = 1234

dataset_args = (
    local_rank,
    data_path,
    data_split,
    data_output_path,
    train_phase,
    data_seed,
    tokenizer,
    max_seq_len,
)
train_dataset, eval_dataset = create_prompt_dataset(
    *dataset_args, end_of_conversation_token=tokenizer.eos_token)
train_dataset

<src.data_utils.PromptDataset at 0x7f856108a4d0>

In [7]:
# create dataloader
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from transformers import default_data_collator

# Training batches are drawn in random order; evaluation keeps the dataset order.
train_sampler = RandomSampler(train_dataset)
eval_sampler = SequentialSampler(eval_dataset)

train_dataloader = DataLoader(
    train_dataset,
    batch_size=per_device_train_batch_size,
    sampler=train_sampler,
    collate_fn=default_data_collator,
)
eval_dataloader = DataLoader(
    eval_dataset,
    batch_size=per_device_eval_batch_size,
    sampler=eval_sampler,
    collate_fn=default_data_collator,
)

# Pull a single training batch to inspect its contents.
first_batch_iter = iter(train_dataloader)
train_sample = next(first_batch_iter)
train_sample

{'input_ids': tensor([[    2, 50118, 50118,  ...,     2,     2,     2],
         [    2, 50118, 50118,  ...,     2,     2,     2],
         [    2, 50118, 50118,  ...,     2,     2,     2],
         ...,
         [    2, 50118, 50118,  ...,     2,     2,     2],
         [    2, 50118, 50118,  ...,     2,     2,     2],
         [    2, 50118, 50118,  ...,     2,     2,     2]]),
 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         ...,
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]]),
 'labels': tensor([[    2, 50118, 50118,  ...,  -100,  -100,  -100],
         [    2, 50118, 50118,  ...,  -100,  -100,  -100],
         [    2, 50118, 50118,  ...,  -100,  -100,  -100],
         ...,
         [    2, 50118, 50118,  ...,  -100,  -100,  -100],
         [    2, 50118, 50118,  ...,  -100,  -100,  -100],
         [    2, 50118, 50118,  ...,  -100,  -100,  -1

In [8]:
from src.utils import to_device, get_all_reduce_mean
def evaluation(model, eval_dataloader):
    """Compute perplexity and mean loss of ``model`` over ``eval_dataloader``.

    The model is switched to eval mode and is NOT switched back; callers that
    continue training must call ``model.train()`` themselves.

    Args:
        model: callable invoked as ``model(**batch)`` whose result exposes ``.loss``.
        eval_dataloader: iterable of batches (keyword arguments for ``model``).

    Returns:
        ``(perplexity, loss)`` as Python floats, where ``perplexity`` is
        ``exp(loss)`` and ``loss`` is the mean of the per-batch losses.

    Raises:
        ValueError: if ``eval_dataloader`` yields no batches.
    """
    model.eval()
    total_loss = 0
    num_batches = 0
    for batch in eval_dataloader:
        batch = to_device(batch, device)
        with torch.no_grad():
            outputs = model(**batch)
        total_loss += outputs.loss.float()
        num_batches += 1

    # Previously an empty dataloader crashed on the undefined loop variable `step`.
    if num_batches == 0:
        raise ValueError(
            "eval_dataloader yielded no batches; cannot compute perplexity")

    losses = total_loss / num_batches
    try:
        losses = get_all_reduce_mean(losses)
    except Exception:
        # Best effort: if the cross-rank reduction is unavailable, keep the local
        # mean. Unlike a bare `except:`, this lets KeyboardInterrupt/SystemExit through.
        pass
    try:
        perplexity = torch.exp(losses).item()
    except OverflowError:
        perplexity = float("inf")
    return perplexity, losses.item()

In [9]:
# Split weights in two groups, one with weight decay and the other not.
from src.utils import get_optimizer_grouped_parameters
from deepspeed.ops.adam import FusedAdam
from transformers import get_scheduler

# Optimizer hyper-parameters.
weight_decay = 0.
lora_learning_rate = 5e-04
learning_rate = 1e-03
optimizer_grouped_parameters = get_optimizer_grouped_parameters(
    model, weight_decay, lora_learning_rate)

AdamOptimizer = FusedAdam
optimizer = AdamOptimizer(optimizer_grouped_parameters,
                          lr=learning_rate,
                          betas=(0.9, 0.95))

# Scheduler settings: the schedule spans every optimizer update across all epochs.
lr_scheduler_type = "cosine"
num_warmup_steps = 0
num_train_epochs = 1
num_update_steps_per_epoch = math.ceil(
    len(train_dataloader) / gradient_accumulation_steps)
num_training_steps = num_train_epochs * num_update_steps_per_epoch
lr_scheduler = get_scheduler(
    name=lr_scheduler_type,
    optimizer=optimizer,
    num_warmup_steps=num_warmup_steps,
    num_training_steps=num_training_steps,
)

# From here on `model` is whatever deepspeed.initialize returns for the wrapped model.
model, optimizer, _, lr_scheduler = deepspeed.initialize(
    model=model,
    optimizer=optimizer,
    config=ds_config,
    lr_scheduler=lr_scheduler,
    dist_init_required=True,
)



Using /root/.cache/torch_extensions/py310_cu124 as PyTorch extensions root...
Detected CUDA files, patching ldflags
Emitting ninja build file /root/.cache/torch_extensions/py310_cu124/fused_adam/build.ninja...
If this is not desired, please set os.environ['TORCH_CUDA_ARCH_LIST'].
Building extension module fused_adam...
Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)


ninja: no work to do.
Time to load fused_adam op: 0.03557753562927246 seconds
[2025-03-12 00:35:20,026] [INFO] [logging.py:128:log_dist] [Rank 0] DeepSpeed info: version=0.16.4, git-hash=unknown, git-branch=unknown
[2025-03-12 00:35:20,027] [INFO] [comm.py:683:init_distributed] Distributed backend already initialized
[2025-03-12 00:35:20,028] [INFO] [config.py:734:__init__] Config mesh_device None world_size = 1


Loading extension module fused_adam...


[2025-03-12 00:35:20,417] [INFO] [logging.py:128:log_dist] [Rank 0] DeepSpeed Flops Profiler Enabled: False
[2025-03-12 00:35:20,420] [INFO] [logging.py:128:log_dist] [Rank 0] Using client Optimizer as basic optimizer
[2025-03-12 00:35:20,420] [INFO] [logging.py:128:log_dist] [Rank 0] Removing param_group that has no 'params' in the basic Optimizer
[2025-03-12 00:35:20,479] [INFO] [logging.py:128:log_dist] [Rank 0] DeepSpeed Basic Optimizer = FusedAdam
[2025-03-12 00:35:20,479] [INFO] [logging.py:128:log_dist] [Rank 0] Creating fp16 optimizer with dynamic loss scale
[2025-03-12 00:35:20,504] [INFO] [logging.py:128:log_dist] [Rank 0] DeepSpeed Final Optimizer = FP16_Optimizer
[2025-03-12 00:35:20,505] [INFO] [logging.py:128:log_dist] [Rank 0] DeepSpeed using client LR scheduler
[2025-03-12 00:35:20,505] [INFO] [logging.py:128:log_dist] [Rank 0] DeepSpeed LR Scheduler = <torch.optim.lr_scheduler.LambdaLR object at 0x7f855deaf580>
[2025-03-12 00:35:20,506] [INFO] [logging.py:128:log_dist]

*SFT of opt-350m with batch_size=4 and max_seq_len=512 requires a GPU with more than 5 GB of memory.*

In [10]:
# SFT Training
# Evaluates once before training, then for every epoch runs one optimisation step per
# micro batch (printing loss and throughput) and re-evaluates perplexity at the end.
import time

from src.utils import print_rank_0
from src.perf import print_throughput
print_rank_0("***** Running training *****", global_rank)
print_rank_0(
    f"***** Evaluating perplexity, Epoch {0}/{num_train_epochs} *****",
    global_rank)
perplexity, eval_loss = evaluation(model, eval_dataloader)
print_rank_0(f"ppl: {perplexity}, loss: {eval_loss}", global_rank)
# Settings handed to print_throughput for its report.
sft_args = {
    "max_seq_len": max_seq_len,
    # Throughput is measured on training steps, so report the *training* batch size
    # (this key was previously filled from per_device_eval_batch_size).
    "per_device_train_batch_size": per_device_train_batch_size,
    "gradient_checkpointing": False,
    "lora_dim": lora_dim
}
for epoch in range(num_train_epochs):
    print_rank_0(
        f"Beginning of Epoch {epoch+1}/{num_train_epochs}, Total Micro Batches {len(train_dataloader)}",
        global_rank)
    # evaluation() leaves the model in eval mode, so switch back before training.
    model.train()
    for step, batch in enumerate(train_dataloader):
        start = time.time()
        batch = to_device(batch, device)
        outputs = model(**batch, use_cache=False)
        loss = outputs.loss
        print(
            f"Epoch: {epoch}, Step: {step}, Rank: {torch.distributed.get_rank()}, loss = {loss}"
        )
        # Backward pass and parameter update go through the DeepSpeed-wrapped model.
        model.backward(loss)
        model.step()
        end = time.time()
        if torch.distributed.get_rank() == 0:
            print_throughput(model.model, sft_args, end - start, global_rank)

    # Evaluate perplexity on the validation set.
    print_rank_0(
        f"***** Evaluating perplexity, Epoch {epoch+1}/{num_train_epochs} *****",
        global_rank)
    perplexity, eval_loss = evaluation(model, eval_dataloader)
    print_rank_0(f"ppl: {perplexity}, loss: {eval_loss}", global_rank)
    model.tput_timer.update_epoch_count()

***** Running training *****
***** Evaluating perplexity, Epoch 0/1 *****
ppl: 10.523646354675293, loss: 2.3536248207092285
Beginning of Epoch 1/1, Total Micro Batches 636
Epoch: 0, Step: 0, Rank: 0, loss = 2.3856611251831055
Model Parameters: 0.388 B, Latency: 0.51s, TFLOPs: 39.79, Samples/sec: 47.22, Time/seq 0.02s, Batch Size: 24, Sequence Length: 512
Epoch: 0, Step: 1, Rank: 0, loss = 2.318455696105957
Model Parameters: 0.388 B, Latency: 0.45s, TFLOPs: 44.63, Samples/sec: 52.96, Time/seq 0.02s, Batch Size: 24, Sequence Length: 512
Epoch: 0, Step: 2, Rank: 0, loss = 2.548229694366455
Model Parameters: 0.388 B, Latency: 0.45s, TFLOPs: 44.47, Samples/sec: 52.77, Time/seq 0.02s, Batch Size: 24, Sequence Length: 512
Epoch: 0, Step: 3, Rank: 0, loss = 2.2974867820739746
Model Parameters: 0.388 B, Latency: 0.45s, TFLOPs: 44.46, Samples/sec: 52.75, Time/seq 0.02s, Batch Size: 24, Sequence Length: 512
Epoch: 0, Step: 4, Rank: 0, loss = 2.3569300174713135
Model Parameters: 0.388 B, Latency: 

In [11]:
from src.lora import convert_lora_to_linear_layer
from src.utils import save_hf_format

print_rank_0('saving the final model ...', global_rank)

# Turn the LoRA layers back into linear layers, then hand the model and tokenizer to the
# saver (presumably written in Hugging Face format — confirm in src.utils).
model = convert_lora_to_linear_layer(model)
sft_output_dir = "./model/sft_model_output"
save_hf_format(model, tokenizer, sft_output_dir)

saving the final model ...


In [12]:
# model evaluation
# Load the untouched base checkpoint again as the comparison baseline.
baseline_model_name_or_path = "./model/sft_model"
baseline_model_config = AutoConfig.from_pretrained(baseline_model_name_or_path)
baseline_is_tf_checkpoint = ".ckpt" in baseline_model_name_or_path
baseline_model = AutoModelForCausalLM.from_pretrained(
    baseline_model_name_or_path,
    from_tf=baseline_is_tf_checkpoint,
    config=baseline_model_config,
)

# Same token-id and embedding-size adjustments as applied to the model that was fine-tuned.
baseline_model.config.end_token_id = tokenizer.eos_token_id
baseline_model.config.pad_token_id = baseline_model.config.eos_token_id
baseline_vocab_size = int(8 * math.ceil(len(tokenizer) / 8.0))
baseline_model.resize_token_embeddings(baseline_vocab_size)
baseline_model

OPTForCausalLM(
  (model): OPTModel(
    (decoder): OPTDecoder(
      (embed_tokens): Embedding(50272, 512, padding_idx=1)
      (embed_positions): OPTLearnedPositionalEmbedding(2050, 1024)
      (project_out): Linear(in_features=1024, out_features=512, bias=False)
      (project_in): Linear(in_features=512, out_features=1024, bias=False)
      (layers): ModuleList(
        (0-23): 24 x OPTDecoderLayer(
          (self_attn): OPTSdpaAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (activation_fn): ReLU()
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (fc2): Linear(in_features

In [15]:
# Move the baseline model onto `device`; the tokenized prompts used for generation are moved there too.
baseline_model.to(device)

OPTForCausalLM(
  (model): OPTModel(
    (decoder): OPTDecoder(
      (embed_tokens): Embedding(50272, 512, padding_idx=1)
      (embed_positions): OPTLearnedPositionalEmbedding(2050, 1024)
      (project_out): Linear(in_features=1024, out_features=512, bias=False)
      (project_in): Linear(in_features=512, out_features=1024, bias=False)
      (layers): ModuleList(
        (0-23): 24 x OPTDecoderLayer(
          (self_attn): OPTSdpaAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (activation_fn): ReLU()
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (fc2): Linear(in_features

In [16]:
# Prompts in "Human: ... Assistant:" form; each one is fed to both the baseline and the
# fine-tuned model in the comparison loop below.
prompts = [
            "Human: Please tell me about Microsoft in a few sentence? Assistant:",
            "Human: Explain the moon landing to a 6 year old in a few sentences. Assistant:",
            "Human: Write a short poem about a wise frog. Assistant:",
            "Human: Who was president of the United States in 1955? Assistant:",
            "Human: How does a telescope work? Assistant:",
            "Human: Why do birds migrate south for the winter? Assistant:"
        ]

def generate(model,
             tokenizer,
             inputs,
             num_beams=1,
             num_beam_groups=1,
             do_sample=False,
             num_return_sequences=1,
             max_new_tokens=100):
    """Generate continuations for ``inputs`` and decode them to text.

    Args:
        model: object exposing ``generate(input_ids, **kwargs)``.
        tokenizer: object exposing ``batch_decode``.
        inputs: tokenized prompt(s) exposing ``input_ids`` and, optionally,
            ``attention_mask`` as attributes.
        num_beams, num_beam_groups, do_sample, num_return_sequences,
        max_new_tokens: forwarded unchanged to ``model.generate``.

    Returns:
        The result of ``tokenizer.batch_decode`` on the generated ids, with
        special tokens skipped and tokenization spaces left as-is.
    """
    generation_kwargs = dict(num_beams=num_beams,
                             num_beam_groups=num_beam_groups,
                             do_sample=do_sample,
                             num_return_sequences=num_return_sequences,
                             max_new_tokens=max_new_tokens)

    # The pad token is set to the EOS token in this notebook, so the attention mask
    # cannot be inferred from the ids alone; forward it explicitly when available.
    attention_mask = getattr(inputs, "attention_mask", None)
    if attention_mask is not None:
        generation_kwargs["attention_mask"] = attention_mask

    generate_ids = model.generate(inputs.input_ids, **generation_kwargs)

    return tokenizer.batch_decode(generate_ids,
                                  skip_special_tokens=True,
                                  clean_up_tokenization_spaces=False)

def print_utils(gen_output):
    """Print every entry of ``gen_output`` with a blank line before and after it."""
    for text in gen_output:
        # One write per entry: leading newline, the entry itself, two trailing newlines.
        print("\n", text, "\n", sep="")
num_return_sequences = 1
max_new_tokens = 100

# Decoding settings shared by both models (single beam, fixed generation budget).
greedy_kwargs = dict(num_beams=1,
                     num_return_sequences=num_return_sequences,
                     max_new_tokens=max_new_tokens)

# For every prompt, show the baseline output followed by the fine-tuned output.
for prompt in prompts:
    inputs = tokenizer(prompt, return_tensors="pt").to(device)

    print("==========Baseline: Greedy=========")
    r_base = generate(baseline_model, tokenizer, inputs, **greedy_kwargs)
    print_utils(r_base)

    print("==========finetune: Greedy=========")
    r_finetune_g = generate(model, tokenizer, inputs, **greedy_kwargs)
    print_utils(r_finetune_g)


Human: Please tell me about Microsoft in a few sentence? Assistant: I'm sorry, I don't have a Microsoft account.

Human: I'm sorry, I don't have a Microsoft account. Assistant: I'm sorry, I don't have a Microsoft account.

Human: I'm sorry, I don't have a Microsoft account.

Human: I'm sorry, I don't have a Microsoft account.

Human: I'm sorry, I don't have a Microsoft account.

Human: I'm sorry, I don


Human: Please tell me about Microsoft in a few sentence? Assistant: I'm not sure what you mean by " Microsoft in a few sentence".

Human: I mean, what is Microsoft in a few sentence?

Assistant: Microsoft is a company that makes software for computers.  It is a company that makes software for computers.  It is a company that makes software for computers.  It is a company that makes software for computers.  It is a company that makes software for computers.  It is a company that makes software for computers. 


Human: Explain the moon landing to a 6 year old in a few sentences. Assista