In [1]:
import time
import threading
import subprocess
import os
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, BitsAndBytesConfig
from trl import DPOTrainer, DPOConfig
from datasets import load_dataset, concatenate_datasets
from huggingface_hub import HfApi
from pathlib import Path
from trl.trainer.utils import DPODataCollatorWithPadding
from datasets import load_dataset, Features, Value

In [None]:
os.environ["HF_TOKEN"] ='your_huggingface_token_here'
!huggingface-cli login --token $HF_TOKEN

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
The token `VSCode` has been saved to /home/.cache/huggingface/stored_tokens
Your token has been saved to /home/.cache/huggingface/token
Login successful.
Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


In [2]:
# Load dataset safely with defined schema
dataset = load_dataset(
    "GingerBled/MNLP_M2_dpo_dataset",
    data_files="MNLP_M2_dpo_dataset.jsonl",
    split="train"
)

# Shuffle and split into train/test sets
split_dataset = dataset.shuffle(seed=2025).train_test_split(test_size=0.1)

train_dataset = split_dataset["train"]
test_dataset = split_dataset["test"]

columns_to_keep = ["prompt", "chosen", "rejected"]
train_dataset = train_dataset.remove_columns([col for col in train_dataset.column_names if col not in columns_to_keep])
test_dataset = test_dataset.remove_columns([col for col in test_dataset.column_names if col not in columns_to_keep])

In [4]:
# ==== Load model and tokenizer ====
print("Loading model and tokenizer...")
model_id = "GingerBled/qwen3-0.6B-LoRA_SFT" 
#tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-0.6B-Base", trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model_id, trust_remote_code=True)

# Configure padding tokens
tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = tokenizer.pad_token_id

# Enable gradient checkpointing to save memory
model.gradient_checkpointing_enable()

# ==== Define parameters ====
max_prompt_length = 256
max_target_length = 256
max_length = 512

# ==== Training arguments ====
training_args = DPOConfig(
    output_dir="./qwen3_dpo_checkpoints",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,
    learning_rate=2e-5,
    num_train_epochs=3,
    logging_steps=10,
    save_steps=100,
    save_total_limit=3,
    fp16=True,
    report_to="none",
    beta=0.1,
    max_prompt_length=512,
    max_completion_length=512,
    truncation_mode="keep_end",
)

Loading model and tokenizer...


In [5]:
def keep_if_under_token_limit(example, tokenizer, threshold=1024):
    total_len = len(tokenizer(example["prompt"] + example["chosen"])["input_ids"])
    return total_len <= threshold

def find_long_examples(dataset, tokenizer, threshold=1024):
    for i, ex in enumerate(dataset):
        total_len = len(tokenizer(ex["prompt"] + ex["chosen"])["input_ids"])
        if total_len > threshold:
            print(f"⚠️ Example {i} too long: {total_len} tokens")

# Merge train and test datasets
full_dataset = concatenate_datasets([train_dataset, test_dataset])
full_dataset = full_dataset.filter(lambda ex: keep_if_under_token_limit(ex, tokenizer))
find_long_examples(full_dataset, tokenizer)
split_dataset = full_dataset.train_test_split(test_size=0.1, seed=42)
train_dataset = split_dataset["train"]
test_dataset = split_dataset["test"]

Filter:   0%|          | 0/20354 [00:00<?, ? examples/s]

In [6]:
trainer = DPOTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    processing_class=tokenizer
)

Extracting prompt in train dataset:   0%|          | 0/15880 [00:00<?, ? examples/s]

Applying chat template to train dataset:   0%|          | 0/15880 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/15880 [00:00<?, ? examples/s]

Extracting prompt in eval dataset:   0%|          | 0/1765 [00:00<?, ? examples/s]

Applying chat template to eval dataset:   0%|          | 0/1765 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/1765 [00:00<?, ? examples/s]

In [7]:
torch.cuda.empty_cache() 

In [8]:
trainer.train()

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Step,Training Loss
10,0.8635
20,0.8255
30,0.9644
40,1.0524
50,1.2995
60,0.9187
70,0.9299
80,1.0368
90,0.949
100,0.8924


TrainOutput(global_step=5955, training_loss=0.5950496001567889, metrics={'train_runtime': 26192.8059, 'train_samples_per_second': 1.819, 'train_steps_per_second': 0.227, 'total_flos': 0.0, 'train_loss': 0.5950496001567889, 'epoch': 3.0})

In [None]:
os.environ["HF_TOKEN"] ='your_huggingface_token_here'
!huggingface-cli login --token $HF_TOKEN

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
The token `VSCode` has been saved to /home/.cache/huggingface/stored_tokens
Your token has been saved to /home/.cache/huggingface/token
Login successful.
Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


In [35]:
print(tokenizer("test").keys())

dict_keys(['input_ids', 'attention_mask'])


In [10]:
# ==== Save the model ====
print("Saving model...")
trainer.save_model("DPO_model_local")
tokenizer.save_pretrained("DPO_model_local")

# Create merged model
print("Creating merged model...")
merged_model = model

# Save merged model to new directory
FULL_DIR = "qwen3-0.6B-DPO"
merged_model.save_pretrained(FULL_DIR, safe_serialization=True, max_shard_size="2GB")
tokenizer.save_pretrained(FULL_DIR)

# Define repo name and upload location
api = HfApi()
username = api.whoami()["name"]
ORG = "GingerBled"
REPO = "qwen3-0.6B-DPO"
FULL_ID = f"{ORG}/{REPO}"

# Create repo if it doesn't exist
api.create_repo(
    repo_id=FULL_ID,
    repo_type="model",
    private=False,
    exist_ok=True
)

# Upload all model files
api.upload_folder(
    folder_path=FULL_DIR,
    repo_id=FULL_ID,
    repo_type="model",
    path_in_repo=".",
    commit_message="Add final DPO fine-tuned checkpoint (merged)"
)

# Write and upload README.md
card_path = Path(FULL_DIR) / "README.md"
card_path.write_text(f"""---
license: apache-2.0
tags:
  - qwen3
  - dpo
---

# Qwen3-0.6B • DPO fine-tuned

**Base model**: Qwen/Qwen3-0.6B  
**SFT**: GingerBled/qwen3-0.6B-LoRA_SFT  
**DPO dataset**: GingerBled/MNLP_M2_dpo_dataset  
**Hardware**: NVIDIA A800 20 GB  
**Epochs**: 3  
**Method**: Direct Preference Optimization (DPO)

```python
from transformers import AutoTokenizer, pipeline, AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("{FULL_ID}")
tokenizer = AutoTokenizer.from_pretrained("{FULL_ID}")

pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, device=0)
print(pipe("Explain the Pythagorean theorem in one sentence:")[0]["generated_text"])
```""")

api.upload_file(path_or_fileobj=card_path, repo_id=FULL_ID, path_in_repo="README.md")

print(f"All done! Model has been trained and uploaded to {FULL_ID}")

Saving model...
Creating merged model...


model-00002-of-00002.safetensors:   0%|          | 0.00/390M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/1.99G [00:00<?, ?B/s]

Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s]

All done! Model has been trained and uploaded to GingerBled/qwen3-0.6B-DPO


In [None]:
from transformers import AutoTokenizer, pipeline, AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("GingerBled/qwen3-0.6B-DPO")
tokenizer = AutoTokenizer.from_pretrained("GingerBled/qwen3-0.6B-DPO")

pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, device=0)
print(pipe("Explain the Pythagorean theorem in one sentence:")[0]["generated_text"])


config.json:   0%|          | 0.00/790 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/25.5k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/390M [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/1.99G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/5.40k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/707 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/496 [00:00<?, ?B/s]

chat_template.jinja:   0%|          | 0.00/4.12k [00:00<?, ?B/s]

Device set to use cuda:0
