# DeepSeek-R1-Distill-Qwen-1.5B Fine-tuning (Fixed)

Fixes applied vs original notebook:
1. **ChatML format** with `<|im_start|>`/`<|im_end|>` tokens (Qwen model family)
2. **`<think>` tags** for `reasoning_content` (R1-distill style)
3. **Label masking**: prompt tokens set to `-100`, only train on assistant response
4. **EOS token** included via `<|im_end|>`
5. **Dynamic padding** via DataCollator (not padding every sample to max_length)
6. **max_length** increased to 2048 (was 512)
7. **num_train_epochs** increased to 3 (was 1)
8. **LoRA r=16, alpha=32** (was r=8, alpha=16)
9. **LoRA applied to all linear layers** (attention + MLP)
10. **Warmup + cosine** learning rate scheduler

## Step 0: Configuration

In [None]:
# --- Paths and model identity ---
MODEL_NAME = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"  # base checkpoint to fine-tune
DATA_FILE = "distill_psychology-10k-r1.json"  # raw dataset (JSON array or JSONL)
OUTPUT_DIR = "./finetuned_models"  # checkpoints, adapter and merged model go here

# --- Data formatting ---
INCLUDE_REASONING = True  # set False to train without <think> reasoning
MAX_LENGTH = 2048  # token cap per sample; longer samples are truncated

# --- LoRA adapter shape ---
LORA_R = 16
LORA_ALPHA = 32
LORA_DROPOUT = 0.05

# --- Optimisation schedule ---
# Effective batch per device = BATCH_SIZE * GRADIENT_ACCUMULATION_STEPS = 32
NUM_EPOCHS = 3
LEARNING_RATE = 2e-5
BATCH_SIZE = 2
GRADIENT_ACCUMULATION_STEPS = 16

## Step 1: Load Tokenizer

In [None]:
from transformers import AutoTokenizer

# Load the tokenizer that ships with the base checkpoint.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Batching needs a pad token; reuse EOS when the checkpoint defines none.
needs_pad_token = tokenizer.pad_token is None
if needs_pad_token:
    tokenizer.pad_token = tokenizer.eos_token

vocab_size = tokenizer.vocab_size
print(f"Tokenizer loaded. Vocab size: {vocab_size}")

## Step 2: Prepare Dataset with ChatML Format

Data format per sample:
```
<|im_start|>user
{input}<|im_end|>
<|im_start|>assistant
<think>
{reasoning_content}
</think>

{content}<|im_end|>
```

In [None]:
import json

# Read DATA_FILE (JSON array or JSONL), turn every usable record into a
# {"user", "assistant"} pair and write the pairs to dataset_processed.jsonl.
samples = []
skipped = 0  # JSONL lines that failed to parse
invalid = 0  # parsed records dropped for missing "input" / "content"

with open(DATA_FILE, "r", encoding="utf-8") as f:
    raw = f.read()

# Try parsing as a JSON array first, then fall back to JSONL
try:
    data_list = json.loads(raw)
    print("Parsed as JSON array")
except json.JSONDecodeError:
    # Parse as JSONL (one JSON object per line)
    data_list = []
    for line_no, line in enumerate(raw.splitlines(), start=1):
        line = line.strip()
        if not line:
            continue
        try:
            data_list.append(json.loads(line))
        except json.JSONDecodeError as e:
            skipped += 1
            # Only show the first few errors to keep the output readable
            if skipped <= 3:
                print(f"Skipping line {line_no}: {e}")
    print(f"Parsed as JSONL ({skipped} lines skipped)")

# A file holding one JSON object parses without error but yields a dict;
# iterating it would walk its keys, so treat it as a one-record dataset.
if isinstance(data_list, dict):
    data_list = [data_list]

for item in data_list:
    # Drop records that are not objects or lack a usable question/answer
    # instead of aborting the whole run with KeyError/TypeError.
    if not isinstance(item, dict):
        invalid += 1
        continue
    user_msg = item.get("input")
    response = item.get("content")
    if not user_msg or not response:
        invalid += 1
        continue

    # `or ""` also covers an explicit null reasoning_content
    reasoning = item.get("reasoning_content") or ""

    # Build assistant message with optional <think> block
    if INCLUDE_REASONING and reasoning:
        assistant_msg = f"<think>\n{reasoning}\n</think>\n\n{response}"
    else:
        assistant_msg = response

    samples.append({
        "user": user_msg,
        "assistant": assistant_msg,
    })

if invalid:
    print(f"Dropped {invalid} records missing 'input' or 'content'")

# Fail early with a clear message rather than writing an empty dataset
if not samples:
    raise ValueError(f"No usable samples found in {DATA_FILE}")

# Write processed JSONL
with open("dataset_processed.jsonl", "w", encoding="utf-8") as f:
    f.writelines(
        json.dumps(sample, ensure_ascii=False) + "\n" for sample in samples
    )

print(f"Loaded {len(samples)} samples")
print(f"Sample 0 user: {samples[0]['user'][:80]}...")
print(f"Sample 0 assistant: {samples[0]['assistant'][:80]}...")

In [None]:
from datasets import load_dataset

# Reload the processed JSONL as a dataset, then hold out 10% for evaluation
# (fixed seed so the split is reproducible).
processed_files = {"train": "dataset_processed.jsonl"}
dataset = load_dataset("json", data_files=processed_files, split="train")

split = dataset.train_test_split(test_size=0.1, seed=42)
train_dataset, eval_dataset = split["train"], split["test"]

size_report = (
    ("Total: ", dataset),
    ("Train: ", train_dataset),
    ("Eval:  ", eval_dataset),
)
for label, subset in size_report:
    print(f"{label}{len(subset)}")

## Step 3: Tokenization with Label Masking

**Key fixes:**
- Prompt tokens are masked with `-100` in labels (model only learns the assistant response)
- **No `padding="max_length"`** here — padding is handled dynamically per batch by the DataCollator (much faster)

In [None]:
def _shared_prefix_len(first, second):
    """Return how many leading items `first` and `second` have in common."""
    count = 0
    for left, right in zip(first, second):
        if left != right:
            break
        count += 1
    return count


def tokenize_function(examples):
    """
    Tokenize a batch in ChatML format and mask the prompt tokens in the labels.

    Only the assistant's response contributes to the training loss.
    NO padding here — DataCollator will pad dynamically per batch.

    Args:
        examples: batch mapping with parallel "user" and "assistant" columns.

    Returns:
        dict with "input_ids", "attention_mask" and "labels", one unpadded
        list per example. Labels equal the input ids except that the prompt
        positions are set to -100.
    """
    # NOTE(review): this assumes "<|im_start|>" / "<|im_end|>" are single
    # tokens in the tokenizer's vocabulary — confirm for MODEL_NAME; if they
    # are not, they are encoded as plain text.

    # Same settings for both encodings, so any special tokens the tokenizer
    # adds on its own (e.g. a leading BOS) appear in both and are counted.
    encode_kwargs = {
        "truncation": True,
        "max_length": MAX_LENGTH,
        "padding": False,
    }

    all_input_ids = []
    all_attention_mask = []
    all_labels = []

    for user_msg, assistant_msg in zip(examples["user"], examples["assistant"]):
        # Prompt portion (everything before the assistant's actual content)
        prompt_text = (
            f"<|im_start|>user\n{user_msg}<|im_end|>\n"
            f"<|im_start|>assistant\n"
        )
        # Full ChatML-formatted text
        full_text = f"{prompt_text}{assistant_msg}<|im_end|>"

        # Tokenize full text — truncate but do NOT pad
        full_tokens = tokenizer(full_text, **encode_kwargs)
        prompt_tokens = tokenizer(prompt_text, **encode_kwargs)
        input_ids = full_tokens["input_ids"]

        # The prompt length is the token prefix both encodings share. This
        # stays correct when the tokenizer prepends or appends special tokens
        # and when tokens merge differently at the prompt/response boundary
        # (in that case it masks slightly less, never response tokens).
        prompt_len = _shared_prefix_len(prompt_tokens["input_ids"], input_ids)

        # Build labels: -100 for prompt tokens (only train on assistant response)
        labels = [-100] * prompt_len + list(input_ids[prompt_len:])

        all_input_ids.append(input_ids)
        all_attention_mask.append(full_tokens["attention_mask"])
        all_labels.append(labels)

    return {
        "input_ids": all_input_ids,
        "attention_mask": all_attention_mask,
        "labels": all_labels,
    }

In [None]:
# Tokenize both splits the same way; the raw text columns are dropped so only
# input_ids / attention_mask / labels remain.
tokenized_splits = {}
for split_name, raw_split in (("train", train_dataset), ("eval", eval_dataset)):
    print(f"Tokenizing {split_name} dataset...")
    tokenized_splits[split_name] = raw_split.map(
        tokenize_function,
        batched=True,
        remove_columns=raw_split.column_names,
    )

tokenized_train = tokenized_splits["train"]
tokenized_eval = tokenized_splits["eval"]

print(f"Tokenized train: {tokenized_train}")
print(f"Tokenized eval:  {tokenized_eval}")

In [None]:
# Sanity check: look at how many tokens of the first training example are
# masked, then summarise sequence lengths across the training split.
first_example = tokenized_train[0]
sample_ids = first_example["input_ids"]
sample_labels = first_example["labels"]

n_masked = sample_labels.count(-100)
n_trained = len(sample_labels) - n_masked

print("Sample 0 stats:")
print(f"  Total tokens:  {len(sample_ids)}")
print(f"  Trained on:    {n_trained} tokens (assistant response)")
print(f"  Masked (-100): {n_masked} tokens (prompt)")

# Token length distribution
lengths = list(map(len, tokenized_train["input_ids"]))
shortest, longest = min(lengths), max(lengths)
mean_length = sum(lengths) / len(lengths)
# Sequences sitting exactly at the cap are the ones truncation cut off
n_at_cap = lengths.count(MAX_LENGTH)

print("\nToken length distribution (train):")
print(f"  Min: {shortest}, Max: {longest}, Avg: {mean_length:.0f}")
print(f"  Truncated to {MAX_LENGTH}: {n_at_cap} samples")

## Step 4: Load Model with 8-bit Quantization

In [None]:
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

# Load the base checkpoint with 8-bit weights and let device_map="auto"
# decide where the layers are placed.
quantization_config = BitsAndBytesConfig(load_in_8bit=True)
load_kwargs = {
    "quantization_config": quantization_config,
    "device_map": "auto",
}
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, **load_kwargs)

print("Model loaded with 8-bit quantization")

## Step 5: Apply LoRA (attention + MLP layers)

In [None]:
from peft import (
    LoraConfig,
    TaskType,
    get_peft_model,
    prepare_model_for_kbit_training,
)

# The base model was loaded with quantized weights. PEFT documents
# prepare_model_for_kbit_training() as the step to run on a quantized model
# before attaching adapters for training.
model = prepare_model_for_kbit_training(model)

# LoRA adapters on the attention projections and the MLP projections.
lora_config = LoraConfig(
    r=LORA_R,
    lora_alpha=LORA_ALPHA,
    lora_dropout=LORA_DROPOUT,
    task_type=TaskType.CAUSAL_LM,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                     "gate_proj", "up_proj", "down_proj"],
)

model = get_peft_model(model, lora_config)
# Report how many parameters are trainable after wrapping
model.print_trainable_parameters()

## Step 6: Training

Uses `DataCollatorForSeq2Seq` for **dynamic padding** — each batch is only padded to the longest sample in that batch, not to `max_length`. This is much faster than static padding.

In [None]:
from transformers import TrainingArguments, Trainer, DataCollatorForSeq2Seq

# Dynamic padding: pads each batch to the longest sample in that batch
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    padding=True,
    pad_to_multiple_of=8,  # for GPU efficiency
)

training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
    fp16=True,
    logging_steps=10,
    logging_first_step=True,
    save_steps=100,
    eval_strategy="steps",
    eval_steps=50,
    learning_rate=LEARNING_RATE,
    warmup_ratio=0.05,
    lr_scheduler_type="cosine",
    save_total_limit=3,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    report_to="none",
)

print("Training args configured")

In [None]:
# Wire the LoRA-wrapped model, both tokenized splits and the padding collator
# into a Trainer, then run the full training loop.
trainer_kwargs = {
    "model": model,
    "args": training_args,
    "train_dataset": tokenized_train,
    "eval_dataset": tokenized_eval,
    "data_collator": data_collator,
}
trainer = Trainer(**trainer_kwargs)

print("Starting training...")
trainer.train()
print("Training complete!")

## Step 7: Save LoRA Adapter

In [None]:
# Store the adapter weights and the tokenizer side by side in one directory.
adapter_path = OUTPUT_DIR + "/lora_adapter"
for artifact in (model, tokenizer):
    artifact.save_pretrained(adapter_path)
print(f"LoRA adapter saved to {adapter_path}")

## Step 8: Merge LoRA into Base Model

In [None]:
from transformers import AutoModelForCausalLM
from peft import PeftModel

# Reload the base checkpoint without quantization, attach the saved adapter,
# fold the adapter weights into the base weights and save the result.
print("Loading base model for merging...")
base_model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, device_map="auto")

print("Merging LoRA adapter...")
merged_model = PeftModel.from_pretrained(base_model, adapter_path).merge_and_unload()

merged_path = "/".join((OUTPUT_DIR, "merged_model"))
for artifact in (merged_model, tokenizer):
    artifact.save_pretrained(merged_path)
print(f"Merged model saved to {merged_path}")

## Step 9: Test Fine-tuned Model

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Load the merged model and generate a reply for a few fixed prompts.
CHAT_END = "<|im_end|>"  # end-of-turn marker used in the training format

test_model = AutoModelForCausalLM.from_pretrained(merged_path).to("cuda")
test_tokenizer = AutoTokenizer.from_pretrained(merged_path)

# Collect the ids generation should stop on. convert_tokens_to_ids() may give
# None or the unk id when the marker is not a vocabulary token, so validate it
# and always include the tokenizer's own EOS as a fallback.
candidate_ids = (
    test_tokenizer.convert_tokens_to_ids(CHAT_END),
    test_tokenizer.eos_token_id,
)
stop_token_ids = []
for token_id in candidate_ids:
    is_known = token_id is not None and token_id != test_tokenizer.unk_token_id
    if is_known and token_id not in stop_token_ids:
        stop_token_ids.append(token_id)

test_prompts = [
    "我最近感到非常愤怒，但不知道原因是什么。",
    "我对所承担的所有责任感到非常不堪重负。",
    "我晚上总是睡不好觉，白天也没有精神。",
]

for prompt in test_prompts:
    # Use ChatML format for inference (must match training format)
    chat_input = f"<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant\n"
    inputs = test_tokenizer(chat_input, return_tensors="pt").to("cuda")

    with torch.no_grad():
        outputs = test_model.generate(
            **inputs,
            max_new_tokens=512,
            temperature=0.7,
            top_p=0.9,
            do_sample=True,
            # None lets generate() fall back to the model's generation config
            eos_token_id=stop_token_ids or None,
        )

    # Decode only the newly generated tokens, not the echoed prompt
    generated_ids = outputs[0][inputs["input_ids"].shape[1]:]
    response = test_tokenizer.decode(generated_ids, skip_special_tokens=True)
    # If the marker was produced as plain text it survives decoding; cut the
    # reply there so nothing after the end of the turn is shown.
    response = response.split(CHAT_END, 1)[0]

    print(f"问: {prompt}")
    print(f"答: {response}")
    print("-" * 60)

## Step 10: Interactive Chat

In [None]:
# Simple read-generate-print loop on top of test_model / test_tokenizer.
print("Interactive chat started. Type 'quit' to exit.\n")

chat_end_marker = "<|im_end|>"  # end-of-turn marker used in the training format

# Ids generation should stop on. convert_tokens_to_ids() may give None or the
# unk id when the marker is not a vocabulary token, so validate it and always
# include the tokenizer's own EOS as a fallback.
chat_stop_ids = []
for token_id in (
    test_tokenizer.convert_tokens_to_ids(chat_end_marker),
    test_tokenizer.eos_token_id,
):
    is_known = token_id is not None and token_id != test_tokenizer.unk_token_id
    if is_known and token_id not in chat_stop_ids:
        chat_stop_ids.append(token_id)

while True:
    try:
        prompt = input("问: ")
    except (EOFError, KeyboardInterrupt):
        # Closed stdin or Ctrl-C: leave the loop instead of raising
        print("\n再见！")
        break

    if prompt.strip().lower() == "quit":
        print("再见！")
        break
    if not prompt.strip():
        # Nothing to answer; ask again rather than generating from an empty turn
        continue

    chat_input = f"<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant\n"
    # Place the inputs on whichever device the model lives on
    inputs = test_tokenizer(chat_input, return_tensors="pt").to(test_model.device)

    with torch.no_grad():
        outputs = test_model.generate(
            **inputs,
            max_new_tokens=512,
            temperature=0.7,
            top_p=0.9,
            do_sample=True,
            # None lets generate() fall back to the model's generation config
            eos_token_id=chat_stop_ids or None,
        )

    # Decode only the newly generated tokens, not the echoed prompt
    generated_ids = outputs[0][inputs["input_ids"].shape[1]:]
    response = test_tokenizer.decode(generated_ids, skip_special_tokens=True)
    # Cut at the end-of-turn marker in case it was produced as plain text
    response = response.split(chat_end_marker, 1)[0]
    print(f"答: {response}\n")