In [None]:
# import sys
# !{sys.executable} -m pip install transformers torch accelerate bitsandbytes peft trl datasets tensorboard # Already installed

# Import necessary packages for the fine-tuning process
import os
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline,
    logging,
)

from peft import LoraConfig, PeftModel
from trl import SFTTrainer
from huggingface_hub import login

# Hugging Face login.
# The access token is read from the HF_TOKEN environment variable so that no
# credential is ever stored in (or committed with) the notebook. An unset or
# empty variable is normalised to None, in which case `login` falls back to its
# interactive prompt instead of sending a bogus token.
HF_TOKEN = os.environ.get("HF_TOKEN") or None
login(token=HF_TOKEN)

# Model and dataset configuration
model_name = "username/model_name"    # base checkpoint passed to from_pretrained
dataset_name = "data/WVQ_all.jsonl"   # JSON-lines file passed to load_dataset
new_model = "new_model_name"          # name used when saving / pushing the result

################################################################################
# LoRA parameters (forwarded to LoraConfig)
################################################################################
lora_r = 32
lora_alpha = 64
lora_dropout = 0.2

################################################################################
# bitsandbytes parameters (forwarded to BitsAndBytesConfig)
################################################################################
use_4bit = True
# Single source of truth for the 4-bit compute dtype: the torch dtype object is
# resolved from this name, so changing the string is enough to switch precision.
bnb_4bit_compute_dtype = "float16"
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)
bnb_4bit_quant_type = "nf4"
use_nested_quant = False

################################################################################
# TrainingArguments parameters
################################################################################
# Checkpoints and logs go to a directory named after the model being produced.
output_dir = f"./{new_model}"
num_train_epochs = 3
# Mixed-precision flags; fp16 matches the float16 compute dtype chosen above.
fp16 = True
bf16 = False
per_device_train_batch_size = 1
per_device_eval_batch_size = 1
# With a per-device batch of 1 this gives 16 samples per optimizer step per device.
gradient_accumulation_steps = 16
gradient_checkpointing = True
# Request the non-reentrant checkpointing implementation.
# NOTE(review): the model is loaded with device_map="auto", not wrapped in DDP —
# confirm which multi-GPU mode this setting was meant to support.
gradient_checkpointing_kwargs = {"use_reentrant": False}
max_grad_norm = 0.3
learning_rate = 2e-5
weight_decay = 0
optim = "paged_adamw_32bit"
lr_scheduler_type = "cosine"
# -1 leaves the run length to num_train_epochs.
max_steps = -1
warmup_ratio = 0.2
group_by_length = True
save_steps = 100
logging_steps = 10

In [None]:
################################################################################
# SFT setup: dataset, quantized base model, tokenizer, LoRA config, trainer
################################################################################
# Set device_map to "auto" for automatic multi-GPU handling by Accelerate
device_map = "auto"

# Step 1: Load and inspect dataset
dataset = load_dataset('json', data_files=dataset_name, split="train")
print(f"Dataset size: {len(dataset)}")
print(f"Dataset columns: {dataset.column_names}")
print(f"Sample entry: {dataset[0]}")

# Step 2: Build the quantization (QLoRA) configuration from the config cell
bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=use_nested_quant,
)

# Step 3: Load base model with the quantization config applied
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map=device_map,
    trust_remote_code=True,
)
# The generation cache is turned off for training.
model.config.use_cache = False
# NOTE(review): pretraining_tp looks like a Llama-family config field — confirm
# it is meaningful for the chosen base model.
model.config.pretraining_tp = 1

# Step 4: Load tokenizer; fall back to EOS as the padding token when none is set
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"
# NOTE(review): this tokenizer is not handed to SFTTrainer below, so the trainer
# will presumably create its own from the model. If the settings above must
# apply during training, pass it explicitly (the keyword is `tokenizer` or
# `processing_class` depending on the installed TRL version) — confirm.

# Step 5: LoRA configuration
peft_config = LoraConfig(
    r=lora_r,
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
)

# Step 6: Set training parameters
training_arguments = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    per_device_eval_batch_size=per_device_eval_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optim,
    save_steps=save_steps,
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    fp16=fp16,
    bf16=bf16,
    max_grad_norm=max_grad_norm,
    max_steps=max_steps,
    warmup_ratio=warmup_ratio,
    group_by_length=group_by_length,
    lr_scheduler_type=lr_scheduler_type,
    report_to="tensorboard",
    # The flag itself must be forwarded: without it the kwargs below have no
    # effect and gradient checkpointing stays at its default (disabled).
    gradient_checkpointing=gradient_checkpointing,
    gradient_checkpointing_kwargs=gradient_checkpointing_kwargs,
)

# Step 7: Set supervised fine-tuning parameters
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    peft_config=peft_config,
    args=training_arguments,
)

In [None]:
# Step 9: Train model
trainer.train()

# Step 10: Save trained model and tokenizer locally under `new_model`.
# The tokenizer object loaded in the setup cell is used directly rather than
# `trainer.tokenizer`, which is deprecated in recent transformers releases.
trainer.model.save_pretrained(new_model)
tokenizer.save_pretrained(new_model)

# Step 11: Push to the Hub under the repository name `new_model`.
# `Trainer.push_to_hub` takes the commit message as its first positional
# argument, so passing the model name there would not select the repository.
# The model and tokenizer `push_to_hub` methods take the repo id first.
trainer.model.push_to_hub(new_model)
tokenizer.push_to_hub(new_model)

print("Training completed successfully!")