In [1]:
!pip install torch transformers==4.34.1 datasets peft==0.5.0 accelerate trl bitsandbytes optimum auto-gptq==0.4.2



In [2]:
import torch
from datasets import Dataset, load_dataset
from peft import AutoPeftModelForCausalLM, LoraConfig, get_peft_model, prepare_model_for_kbit_training
from transformers import AutoTokenizer, TrainingArguments, AutoModelForCausalLM, GPTQConfig
from trl import DPOTrainer

In [3]:
# from huggingface_hub import notebook_login
# notebook_login()

In [4]:
def dpo_data(split="test_prefs"):
    """Load UltraFeedback preference pairs reduced to the DPO columns.

    Args:
        split: Name of the ``HuggingFaceH4/ultrafeedback_binarized`` split to
            load. Defaults to ``"test_prefs"``, the split this notebook has
            always used.

    Returns:
        The mapped dataset, keeping only the ``prompt``, ``chosen`` and
        ``rejected`` columns; their values are passed through unchanged.
    """
    # No auth token is requested: the notebook's login cell is commented out,
    # so demanding a stored token would fail on a fresh environment, and the
    # `use_auth_token` argument is deprecated in `datasets`.
    # NOTE(review): assumes the dataset is publicly readable - confirm.
    dataset = load_dataset("HuggingFaceH4/ultrafeedback_binarized", split=split)

    # Every source column is dropped; the mapper re-emits the three we keep.
    original_columns = dataset.column_names

    def return_prompt_and_responses(samples):
        """Pick the three preference columns out of one batch."""
        return {
            "prompt": list(samples["prompt"]),
            "chosen": samples["chosen"],
            "rejected": samples["rejected"],
        }

    return dataset.map(
        return_prompt_and_responses,
        batched=True,
        remove_columns=original_columns,
    )

In [5]:
# Tokenizer shipped with the GPTQ checkpoint that is fine-tuned below.
tokenizer = AutoTokenizer.from_pretrained(
    "TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ"
)

# Batching needs a pad token; reuse EOS only when the checkpoint defines none.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [6]:
def _load_gptq_checkpoint():
    """Load one fresh copy of the 4-bit GPTQ checkpoint in float16.

    A new ``GPTQConfig`` is built on every call so the two models created
    below do not share a config object.
    """
    return AutoModelForCausalLM.from_pretrained(
        "TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ",
        torch_dtype=torch.float16,
        low_cpu_mem_usage=True,
        quantization_config=GPTQConfig(bits=4, disable_exllama=True),
    )


# Policy model (receives the LoRA adapters later in the notebook).
model = _load_gptq_checkpoint()

# Second, independent copy that is handed to DPOTrainer as the reference.
model_ref = _load_gptq_checkpoint()

You passed `quantization_config` to `from_pretrained` but the model you're loading already has a `quantization_config` attribute and has already quantized weights. However, loading attributes (e.g. disable_exllama, use_cuda_fp16, max_input_length) will be overwritten with the one you passed to `from_pretrained`. The rest will be ignored.
You passed `quantization_config` to `from_pretrained` but the model you're loading already has a `quantization_config` attribute and has already quantized weights. However, loading attributes (e.g. disable_exllama, use_cuda_fp16, max_input_length) will be overwritten with the one you passed to `from_pretrained`. The rest will be ignored.


In [7]:
# Fetch the preference pairs through the dpo_data() loader defined earlier.
train_dataset = dpo_data()



In [8]:
# Show the dataset summary (column names and row count).
train_dataset

Dataset({
    features: ['prompt', 'chosen', 'rejected'],
    num_rows: 2000
})

In [9]:
# Convert to a pandas DataFrame; the following cells edit columns and sample
# rows on this frame before converting back to a Dataset.
train_df = train_dataset.to_pandas()
train_df

Unnamed: 0,prompt,chosen,rejected
0,You are given a sentence in Spanish. Your job ...,[{'content': 'You are given a sentence in Span...,[{'content': 'You are given a sentence in Span...
1,A software engineer is tasked with creating a ...,[{'content': 'A software engineer is tasked wi...,[{'content': 'A software engineer is tasked wi...
2,Develop a comprehensive marketing campaign for...,[{'content': 'Develop a comprehensive marketin...,[{'content': 'Develop a comprehensive marketin...
3,Can you summarize the discussion at the Goat a...,[{'content': 'Can you summarize the discussion...,[{'content': 'Can you summarize the discussion...
4,is it normal to have a fear of flying?,[{'content': 'is it normal to have a fear of f...,[{'content': 'is it normal to have a fear of f...
...,...,...,...
1995,What do I need to do to run terraform import?,[{'content': 'What do I need to do to run terr...,[{'content': 'What do I need to do to run terr...
1996,Suppose we have a website with multiple pages ...,[{'content': 'Suppose we have a website with m...,[{'content': 'Suppose we have a website with m...
1997,Design a 45-minute lesson plan that highlights...,[{'content': 'Design a 45-minute lesson plan t...,[{'content': 'Design a 45-minute lesson plan t...
1998,Given the text: Corbel in the Peachtree design...,[{'content': 'Given the text: Corbel in the Pe...,[{'content': 'Given the text: Corbel in the Pe...


In [10]:
def _second_message_content(messages):
    """Return the ``content`` of the second message in a conversation.

    Values that are already plain strings are returned untouched, so this cell
    can be re-run after the columns have been flattened without raising.
    """
    if isinstance(messages, str):
        return messages
    # Index 1 is the message after the opening one; the frame preview shows the
    # opening message repeating the prompt.
    # NOTE(review): assumes every conversation has at least two messages.
    return messages[1]["content"]


# Replace the message lists in both preference columns with plain text.
for column in ("chosen", "rejected"):
    train_df[column] = train_df[column].apply(_second_message_content)

In [11]:
# Discard every row that has a missing value in any column.
train_df = train_df.dropna(axis=0, how="any")

In [12]:
# Shuffle all rows with a fixed seed so that Restart & Run All reproduces the
# same train/validation split.
df = train_df.sample(frac=1, random_state=42)

# First 200 shuffled rows train, the next 50 validate. reset_index returns a
# new frame, which avoids mutating a slice of `df` in place.
# NOTE(review): `train_df` is overwritten here, so re-running this cell alone
# would sample from the 200-row subset; re-run from the dropna cell instead.
train_df = df.iloc[:200].reset_index(drop=True)
val_df = df.iloc[200:250].reset_index(drop=True)

In [13]:
# Convert both pandas splits back to Dataset objects for the trainer.
train_data, val_data = (
    Dataset.from_pandas(frame) for frame in (train_df, val_df)
)

In [14]:
# Show the training split summary (column names and row count).
train_data

Dataset({
    features: ['prompt', 'chosen', 'rejected'],
    num_rows: 200
})

In [15]:
# Print the module tree of the loaded model to see the projection layer names.
model

MistralForCausalLM(
  (model): MistralModel(
    (embed_tokens): Embedding(32002, 4096, padding_idx=0)
    (layers): ModuleList(
      (0-31): 32 x MistralDecoderLayer(
        (self_attn): MistralAttention(
          (rotary_emb): MistralRotaryEmbedding()
          (k_proj): QuantLinear()
          (o_proj): QuantLinear()
          (q_proj): QuantLinear()
          (v_proj): QuantLinear()
        )
        (mlp): MistralMLP(
          (act_fn): SiLUActivation()
          (down_proj): QuantLinear()
          (gate_proj): QuantLinear()
          (up_proj): QuantLinear()
        )
        (input_layernorm): MistralRMSNorm()
        (post_attention_layernorm): MistralRMSNorm()
      )
    )
    (norm): MistralRMSNorm()
  )
  (lm_head): Linear(in_features=4096, out_features=32002, bias=False)
)

In [16]:
# LoRA settings: rank-8 adapters on the attention query and value projections,
# configured for training (inference_mode off) on a causal language model.
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    inference_mode=False,
    target_modules=["q_proj", "v_proj"],
    r=8,
    lora_alpha=8,
    lora_dropout=0.1,
    bias="none",
)

In [17]:
# Step 1: make the quantized policy model trainable.
model = prepare_model_for_kbit_training(model)

# Step 2: config tweaks for training (the KV cache is not used while training).
model.config.pretraining_tp = 1
model.config.use_cache = False

# Step 3: trade compute for memory during the backward pass.
model.gradient_checkpointing_enable()

# Step 4: wrap the model so only the LoRA adapter weights are trained.
model = get_peft_model(model, peft_config)

In [18]:
# Print the module tree again to confirm where the LoRA layers were attached.
model

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): MistralForCausalLM(
      (model): MistralModel(
        (embed_tokens): Embedding(32002, 4096, padding_idx=0)
        (layers): ModuleList(
          (0-31): 32 x MistralDecoderLayer(
            (self_attn): MistralAttention(
              (rotary_emb): MistralRotaryEmbedding()
              (k_proj): QuantLinear()
              (o_proj): QuantLinear()
              (q_proj): QuantLinear(
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (quant_linear_module): Quan

In [19]:
# The DPO reference model only scores responses; it is never optimized. Keep it
# as a frozen, eval-mode copy of the base checkpoint instead of preparing it
# for k-bit training, enabling gradient checkpointing and attaching a second
# set of trainable LoRA adapters. Those steps add memory and trainable
# parameters the reference does not need.
model_ref.config.use_cache = False
model_ref.eval()
model_ref.requires_grad_(False)

In [23]:
# Short 50-step DPO run that logs, evaluates and checkpoints every 10 steps.
training_args = TrainingArguments(
    output_dir="dpo-mistral",
    max_steps=50,
    per_device_train_batch_size=1,
    # Evaluate with the same batch size as training. The library default (8)
    # is much larger than the training batch, and this run is memory-bound.
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=1,
    learning_rate=2e-5,
    warmup_steps=2,
    optim="paged_adamw_8bit",
    fp16=True,
    # DPOTrainer needs the raw prompt/chosen/rejected columns kept intact.
    remove_unused_columns=False,
    evaluation_strategy="steps",
    # Stated explicitly; previously this fell back to `logging_steps`.
    eval_steps=10,
    save_strategy="steps",
    save_steps=10,
    logging_first_step=True,
    logging_steps=10,
    report_to="none",
)

In [24]:
# Build the DPO trainer; every argument is passed by keyword so the roles of
# the policy model and the reference model are explicit.
dpo_trainer = DPOTrainer(
    model=model,
    ref_model=model_ref,
    beta=0.1,
    args=training_args,
    tokenizer=tokenizer,
    train_dataset=train_data,
    eval_dataset=val_data,
    max_length=512,
    max_prompt_length=256,
    max_target_length=256,
)

In [25]:
# Run the DPO fine-tuning loop with the trainer configured in the previous cell.
dpo_trainer.train()

Step,Training Loss,Validation Loss


OutOfMemoryError: ignored

In [None]:
from transformers import GenerationConfig

# Directory holding the trained LoRA adapter. The training cell writes to
# output_dir="dpo-mistral" with max_steps=50 and save_steps=10, so the last
# checkpoint is checkpoint-50. The path was previously an empty string.
# NOTE(review): adjust if the run is stopped early or saved elsewhere.
ADAPTER_DIR = "dpo-mistral/checkpoint-50"

# Tokenize the test prompt and move the tensors to the GPU.
inputs = tokenizer(
    """I have dropped my phone in water. Now it is not working what should I do?""",
    return_tensors="pt"
).to("cuda")

# Reload the base checkpoint together with the fine-tuned adapter.
model = AutoPeftModelForCausalLM.from_pretrained(
    ADAPTER_DIR,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map="cuda"
)

generation_config = GenerationConfig(
    do_sample=True,
    top_k=1,
    temperature=0.1,
    # Was `max_new_token`: GenerationConfig does not recognise that name, so
    # the 256-token limit was never applied.
    max_new_tokens=256,
    pad_token_id=tokenizer.eos_token_id
)

In [None]:
import time

# Time one generation: generate, print the decoded text, then print the
# elapsed wall-clock seconds (decoding and printing are included in the time).
started_at = time.time()
generated = model.generate(**inputs, generation_config=generation_config)
decoded_text = tokenizer.decode(generated[0], skip_special_tokens=True)
print(decoded_text)
print(time.time() - started_at)