In [1]:
import time
import threading
import subprocess
import os
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, BitsAndBytesConfig
from trl import DPOTrainer, DPOConfig
from datasets import load_dataset, concatenate_datasets
from huggingface_hub import HfApi
from pathlib import Path
from trl.trainer.utils import DPODataCollatorWithPadding
from datasets import load_dataset, Features, Value

import gc
import transformers
from peft import LoraConfig, PeftModel, get_peft_model, prepare_model_for_kbit_training, AutoPeftModelForCausalLM
import bitsandbytes as bnb

In [2]:
peft_model_name = "GingerBled/qwen3-0.6B-FullFineTune" # The model obtained after the SFT step
new_model = "new_Qwen_DPO" #the name of the DPO trained model

PREPROCESSING = True

BATCH_SIZE = 8
MAX_STEPS = 50
GRAD_ACCUM_STEPS = 8
LEARNING_RATE = 5e-5
LOGGING_STEPS = 50
WARMUP_STEPS = 5
BETA = 0.1
MAX_PROMPT_LENGTH = 512
MAX_LENGTH = 1024

In [3]:
# Tokenizer setup
tokenizer = AutoTokenizer.from_pretrained(peft_model_name)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"

In [4]:
# Preprocessing functions
def keep_if_under_token_limit(example, tokenizer, threshold=MAX_LENGTH):
    total_len = len(tokenizer(example["prompt"] + example["chosen"])["input_ids"])
    return total_len <= threshold

def find_long_examples(dataset, tokenizer, threshold=1024):
    for i, ex in enumerate(dataset):
        total_len = len(tokenizer(ex["prompt"] + ex["chosen"])["input_ids"])
        if total_len > threshold:
            print(f"⚠️ Example {i} too long: {total_len} tokens")
columns_to_keep = ["prompt", "chosen", "rejected"]

In [5]:
# Load dataset safely with defined schema and preprocess
dataset = load_dataset(
    "GingerBled/MNLP_M2_dpo_dataset",
    data_files="MNLP_M2_dpo_dataset.jsonl",
    split="train"
)
dataset = dataset.remove_columns([col for col in dataset.column_names if col not in columns_to_keep])
if PREPROCESSING:
    dataset = dataset.filter(lambda ex: keep_if_under_token_limit(ex, tokenizer))
    find_long_examples(dataset, tokenizer)
# Shuffle and split into train/test sets
split_dataset = dataset.shuffle(seed=2025).train_test_split(test_size=0.1)
train_dataset = split_dataset["train"]
test_dataset = split_dataset["test"]

Filter:   0%|          | 0/20354 [00:00<?, ? examples/s]

In [6]:
# LoRA configuration
peft_config = LoraConfig(
    r=8,
    lora_alpha=8,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=['k_proj', 'v_proj', 'q_proj', 'dense']
)

# Load the base model with BitsAndBytes configuration
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    llm_int8_threshold=6.0,
    llm_int8_has_fp16_weight=False,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
)

model = AutoModelForCausalLM.from_pretrained(
    peft_model_name,
    low_cpu_mem_usage=True,
    torch_dtype=torch.float16,
    quantization_config=bnb_config,
    trust_remote_code=True
)
model.gradient_checkpointing_enable()
model.config.use_cache = False

In [8]:
# Initialize Training arguments
training_args = DPOConfig(
    per_device_train_batch_size=BATCH_SIZE,
    max_steps=MAX_STEPS,
    gradient_accumulation_steps=GRAD_ACCUM_STEPS,
    gradient_checkpointing=True,
    learning_rate=LEARNING_RATE,
    lr_scheduler_type="cosine",
    save_strategy="no",
    logging_steps=LOGGING_STEPS,
    output_dir=new_model,  # Assuming new_model is defined elsewhere
    optim="paged_adamw_32bit",
    warmup_steps=WARMUP_STEPS,
    remove_unused_columns=False,
    beta=BETA,
    max_prompt_length=MAX_PROMPT_LENGTH,
    max_length=MAX_LENGTH
)

# Initialize DPO Trainer
dpo_trainer = DPOTrainer(
    model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    processing_class=tokenizer,
    peft_config=peft_config
)

Extracting prompt in train dataset:   0%|          | 0/15880 [00:00<?, ? examples/s]

Applying chat template to train dataset:   0%|          | 0/15880 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/15880 [00:00<?, ? examples/s]

Extracting prompt in eval dataset:   0%|          | 0/1765 [00:00<?, ? examples/s]

Applying chat template to eval dataset:   0%|          | 0/1765 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/1765 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [9]:
torch.cuda.empty_cache()
dpo_trainer.train()



OutOfMemoryError: CUDA out of memory. Tried to allocate 5.93 GiB. GPU 0 has a total capacity of 19.50 GiB of which 4.07 GiB is free. Process 285714 has 19.41 GiB memory in use. Process 1100962 has 2.09 GiB memory in use. Process 1190612 has 1.89 GiB memory in use. Process 1289902 has 15.41 GiB memory in use. Of the allocated memory 13.86 GiB is allocated by PyTorch, and 1.45 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
os.environ["HF_TOKEN"] =''
!huggingface-cli login --token $HF_TOKEN

In [None]:
# ==== Save the model ====
print("Saving model...")
trainer.save_model("DPO_model_local_new")
tokenizer.save_pretrained("DPO_model_local_new")

# Create merged model
print("Creating merged model...")
merged_model = model

# Save merged model to new directory
FULL_DIR = "qwen3-0.6B-DPO-new"
merged_model.save_pretrained(FULL_DIR, safe_serialization=True, max_shard_size="2GB")
tokenizer.save_pretrained(FULL_DIR)

# Define repo name and upload location
api = HfApi()
username = api.whoami()["name"]
ORG = "GingerBled"
REPO = "qwen3-0.6B-DPO-new"
FULL_ID = f"{ORG}/{REPO}"

# Create repo if it doesn't exist
api.create_repo(
    repo_id=FULL_ID,
    repo_type="model",
    private=False,
    exist_ok=True
)

# Upload all model files
api.upload_folder(
    folder_path=FULL_DIR,
    repo_id=FULL_ID,
    repo_type="model",
    path_in_repo=".",
    commit_message="Add final DPO fine-tuned checkpoint (merged)"
)

# Write and upload README.md
card_path = Path(FULL_DIR) / "README.md"
card_path.write_text(f"""---
license: apache-2.0
tags:
  - qwen3
  - dpo
---

# Qwen3-0.6B • DPO fine-tuned

**Base model**: Qwen/Qwen3-0.6B  
**SFT**: GingerBled/qwen3-0.6B-FullFineTune
**DPO dataset**: GingerBled/MNLP_M2_dpo_dataset  
**Hardware**: NVIDIA A800 20 GB  
**Epochs**: 1  
**Method**: Direct Preference Optimization (DPO)

```python
from transformers import AutoTokenizer, pipeline, AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("{FULL_ID}")
tokenizer = AutoTokenizer.from_pretrained("{FULL_ID}")

pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, device=0)
print(pipe("Explain the Pythagorean theorem in one sentence:")[0]["generated_text"])
```""")

api.upload_file(path_or_fileobj=card_path, repo_id=FULL_ID, path_in_repo="README.md")

print(f"All done! Model has been trained and uploaded to {FULL_ID}")