In [None]:
import os
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
    BitsAndBytesConfig,
    DataCollatorForLanguageModeling,
)
from datasets import load_dataset
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from huggingface_hub import login

# Log in to the Hugging Face Hub using a token read from the HF_TOKEN environment
# variable, so the secret is never stored in the notebook or committed with it.
# When HF_TOKEN is unset the explicit login is skipped (public models, or
# credentials already configured on the machine, do not need it).
hf_token = os.environ.get("HF_TOKEN")
if hf_token:
    login(token=hf_token)

# Index of this process on the current node, as exported by the distributed
# launcher through LOCAL_RANK; a plain single-process run has no such variable
# and falls back to 0.
local_rank = int(os.getenv("LOCAL_RANK", "0"))

# Load the fine-tuning corpus from a local JSON Lines file as a single "train" split.
dataset = load_dataset(
    "json",
    data_files="./data/fine_tune_data.jsonl",
    split="train",
)

model_name = "your_model"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Fall back to EOS as the padding token only when the tokenizer does not define
# one. Overwriting an existing pad token with EOS would make the language-modeling
# collator treat every real EOS as padding and mask it out of the labels.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

def preprocess_function(examples, text_column="text", max_length=512, tokenize_fn=None):
    """
    Tokenize one batch of examples for causal-LM fine-tuning.

    This is the place to format your data first (for example into a
    Question/Answer prompt template) before it is tokenized.

    NOTE(review): the default column name ``"text"`` and the 512-token limit are
    assumptions about the data file -- adjust them to match your JSONL schema.

    Args:
        examples: Batch mapping of column name -> list of values, as passed by
            ``dataset.map(..., batched=True)``.
        text_column: Name of the column holding the raw training text.
        max_length: Sequences longer than this many tokens are truncated.
        tokenize_fn: Callable used to tokenize; defaults to the module-level
            ``tokenizer``.

    Returns:
        The tokenizer output for the batch (expected to provide ``input_ids``
        and ``attention_mask``, which the rest of the script selects).

    Raises:
        KeyError: If ``text_column`` is not present in ``examples``.
    """
    if tokenize_fn is None:
        tokenize_fn = tokenizer
    if text_column not in examples:
        raise KeyError(
            f"Column {text_column!r} not found in batch; "
            f"available columns: {list(examples.keys())}"
        )
    # No padding here: the data collator pads each batch dynamically.
    return tokenize_fn(examples[text_column], truncation=True, max_length=max_length)

# Apply the preprocessing function batch-wise, then have the dataset return
# only the two model-input columns, as torch tensors.
dataset = dataset.map(
    preprocess_function,
    batched=True,
)
dataset.set_format(
    type="torch",
    columns=["input_ids", "attention_mask"],
)

# Quantization: 4-bit NF4 weights with double quantization, fp16 compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
)

# Load the quantized base model. The device map assigns the root module ("")
# to this process's local rank, so each process keeps its copy on its own GPU.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map={"": local_rank},
    quantization_config=bnb_config,
)

# Run the PEFT helper that readies a quantized model for training.
model = prepare_model_for_kbit_training(model)

# Only call the input-gradient hook on models that actually provide it.
if hasattr(model, "enable_input_require_grads"):
    model.enable_input_require_grads()

# LoRA hyper-parameters: rank-8 adapters, scaling alpha 16, 10% dropout, no bias terms.
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=8,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
)
model = get_peft_model(model, lora_config)

# A parameter is trainable exactly when its name marks it as a LoRA/adapter
# weight; every other parameter is frozen.
for param_name, param in model.named_parameters():
    param.requires_grad = ("lora_" in param_name) or ("adapter" in param_name)

model.print_trainable_parameters()

# Training configuration, grouped by concern.
training_args = TrainingArguments(
    # --- Optimization ---
    # 2 sequences per GPU x 8 accumulation steps = 16 sequences per GPU per optimizer step.
    per_device_train_batch_size=2,
    gradient_accumulation_steps=8,
    optim="adamw_torch",
    # 3 epochs is usually enough; drop to 1 or 2 if the model overfits.
    num_train_epochs=3,
    # --- Precision / memory ---
    fp16=True,
    bf16=False,
    gradient_checkpointing=True,
    dataloader_pin_memory=True,
    # --- Checkpointing ---
    output_dir="./output_dir",
    save_steps=500,
    save_total_limit=2,
    # --- Logging ---
    logging_dir="./logs",
    logging_steps=1,
    logging_first_step=True,
    log_level="info",
    prediction_loss_only=False,
    # --- Distributed ---
    # Optional: only valid if every trainable parameter receives a gradient each step.
    ddp_find_unused_parameters=False,
)

# Collator for causal language modeling (mlm=False disables masked-LM masking).
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Wire model, arguments, data and collator into the Trainer.
trainer = Trainer(
    args=training_args,
    model=model,
    data_collator=data_collator,
    train_dataset=dataset,
)

# Start training.
trainer.train()

# Save the model and tokenizer.
# In a multi-process (DDP) run every rank executes this script; let only the
# global main process write, so ranks do not race on the same output files.
if trainer.is_world_process_zero():
    model.save_pretrained("./place_for_model")
    tokenizer.save_pretrained("./place_for_tokenizer")

In [None]:
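# For multi-GPU training, save this code as a script (e.g. finetune.py) and launch one process per GPU:
#   torchrun --nproc_per_node=<NUM_GPUS> finetune.py
# (The older `python -m torch.distributed.launch --nproc_per_node=<NUM_GPUS> finetune.py` form is deprecated in favor of torchrun.)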