In [None]:
import torch
from datasets import load_dataset
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling,
)
from datasets import load_dataset
# NOTE(review): this loads the comparison dataset into `ds` as a side effect of the
# import cell. The data-preparation cell further down loads the same dataset again
# into `dataset`, and `ds` is not referenced by any other cell visible here --
# confirm nothing else depends on `ds` before removing this line.
ds = load_dataset("CarperAI/openai_summarize_comparisons")

In [3]:
# Base checkpoint to fine-tune
model_name = "Qwen/Qwen3-1.7B"

# Use the GPU when one is visible, otherwise fall back to CPU
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Keyword arguments forwarded to AutoModelForCausalLM.from_pretrained
model_kwargs = {"trust_remote_code": True}

if device == "cuda":
    # On GPU, load the weights 4-bit quantized (NF4) and compute in bfloat16
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
    )
    model_kwargs["quantization_config"] = bnb_config
    model_kwargs["device_map"] = "auto"
else:
    # On CPU neither quantization nor device_map is used
    print("CUDA not found. Loading model on CPU without quantization.")

# Load the model and tokenizer
model = AutoModelForCausalLM.from_pretrained(model_name, **model_kwargs)
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

# Only fall back to EOS for padding when the tokenizer ships without a pad token.
# The training cell uses DataCollatorForLanguageModeling(mlm=False), which sets the
# label of every token whose id equals pad_token_id to -100. If pad == eos, every
# end-of-sequence / end-of-turn token is excluded from the loss and the model is
# never trained to stop generating.
# NOTE(review): Qwen chat tokenizers are expected to define their own pad token,
# distinct from EOS -- confirm with `tokenizer.pad_token` / `tokenizer.eos_token`.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# The KV cache is only useful for generation; disable it for training
model.config.use_cache = False

# Without device_map the model must be placed on the device explicitly
if device == "cpu":
    model.to(device)

Using device: cuda


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
# Pull the summarization comparison dataset from the Hugging Face Hub
dataset = load_dataset("CarperAI/openai_summarize_comparisons")

# This SFT PoC only needs the 'train' and 'valid1' splits, and only a small
# leading slice of each so that training and evaluation stay fast.
train_dataset = dataset["train"].select(range(1000))
eval_dataset = dataset["valid1"].select(range(200))

# System message placed at the start of every chat
system_prompt = "You are a helpful assistant that summarizes text with the same voice as the author."

def tokenize_function(examples):
    """
    Render each (prompt, chosen summary) pair as a chat and tokenize it.

    Args:
        examples: A batch as passed by `Dataset.map(batched=True)`. The
            "prompt" and "chosen" columns are read as parallel lists.

    Returns:
        The tokenizer output for the batch: truncated to 1024 tokens and
        left unpadded (the data collator pads each batch dynamically).
    """
    # One system / user / assistant conversation per example in the batch
    chats = [
        [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": f"Summarize the following post:\n{post}"},
            {"role": "assistant", "content": summary},
        ]
        for post, summary in zip(examples["prompt"], examples["chosen"])
    ]
    prompts = [tokenizer.apply_chat_template(chat, tokenize=False) for chat in chats]

    # The chat template has already inserted every special token the model
    # expects, so the tokenizer must not add its own BOS/EOS on top of them.
    return tokenizer(
        prompts,
        truncation=True,
        max_length=1024,
        padding=False,  # The data collator will handle padding
        add_special_tokens=False,
    )

# Tokenize both splits the same way: batched for speed, and with the original
# text columns dropped so only the tokenizer outputs remain.
tokenized_train_dataset, tokenized_eval_dataset = (
    split.map(
        tokenize_function,
        batched=True,
        remove_columns=split.column_names,
    )
    for split in (train_dataset, eval_dataset)
)

In [None]:
# The k-bit preparation step only applies to the quantized model, and the
# model is only quantized on the GPU path.
if device == "cuda":
    model = prepare_model_for_kbit_training(model)

# All LoRA adapter settings live in this one config object.
lora_config = LoraConfig(
    # Qwen is a next-token predictor, so the task is causal language modeling;
    # PEFT uses this to set the wrapped model up for that task.
    task_type="CAUSAL_LM",

    # Rank of the LoRA update matrices -- the main capacity knob. Lower rank
    # means fewer trainable parameters, a smaller adapter and faster training;
    # higher rank can improve quality at the cost of memory. 16 is a common
    # starting point.
    r=16,

    # Scaling factor: LoRA updates are scaled by (lora_alpha / r). A common
    # rule of thumb is alpha = 2 * r.
    lora_alpha=32,

    # Dropout (5%) on the LoRA layers during training, as regularization
    # against overfitting.
    lora_dropout=0.05,
    bias="none",

    # Modules that receive adapters: the query, key, value and output
    # projections of the attention blocks, the most common choice for
    # transformer models.
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
)

# Wrap the base model: adapters are injected into every target module, the
# original weights are frozen, and only the adapter weights are trained.
model = get_peft_model(model, lora_config)

In [None]:
# Training arguments
training_args = TrainingArguments(
    output_dir="./qwen-sft-poc",
    per_device_train_batch_size=12,
    per_device_eval_batch_size=12,
    gradient_accumulation_steps=8,
    learning_rate=1e-4,
    logging_steps=10,
    max_steps=100,  # Limit steps for a quick PoC
    eval_strategy="steps",
    eval_steps=25,
    # Save on the same schedule as evaluation. load_best_model_at_end can only
    # restore a checkpoint that was actually written; saving only at step 100
    # would make the final checkpoint the sole candidate and the evaluations at
    # steps 25/50/75 could never be selected.
    save_steps=25,
    save_total_limit=2,  # Bound the number of checkpoints kept on disk
    load_best_model_at_end=True,
    report_to="none",  # Disable wandb or other reporting for this PoC
)

# Initialize the standard Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_eval_dataset,
    # `processing_class` replaces the deprecated `tokenizer` argument; it is
    # available in transformers >= 4.46.
    processing_class=tokenizer,
    # Data collator will dynamically pad the inputs and create labels
    data_collator=DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False),
)

# Start the training
trainer.train()

# Save the fine-tuned model
trainer.save_model("./qwen-sft-poc/final_checkpoint")