# Multi-GPU Training for Novel Writer
Distributed fine-tuning with Hugging Face Accelerate, which spreads training across multiple GPUs to shorten training time.

In [None]:
!pip install -q accelerate transformers peft bitsandbytes datasets trl

In [None]:
import json
import os
from pathlib import Path
from accelerate import Accelerator
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, BitsAndBytesConfig
from peft import LoraConfig
from datasets import Dataset
from trl import SFTTrainer
import torch

## Initialize Accelerator

In [None]:
# Create the Accelerator and report the distributed setup it detected.
accelerator = Accelerator()

# NOTE: the first message is labelled "GPUs" but the value printed is the
# number of launched processes (accelerator.num_processes).
status_lines = (
    f"Using {accelerator.num_processes} GPUs",
    f"Device: {accelerator.device}",
    f"Mixed precision: {accelerator.mixed_precision}",
)
for status_line in status_lines:
    print(status_line)

## Load Dataset

In [None]:
# Path to the JSONL training file: one JSON object per line.
DATA_PATH = "data/processed/train.jsonl"

# Build one prompt/response training string per JSONL record.
entries = []
# Read as UTF-8 explicitly so decoding does not depend on the platform's
# default locale encoding.
with open(DATA_PATH, "r", encoding="utf-8") as f:
    for line in f:
        line = line.strip()
        # Skip blank lines (e.g. a trailing newline at the end of the file);
        # json.loads("") would raise JSONDecodeError.
        if not line:
            continue
        data = json.loads(line)
        # Records without an "instruction" key get a generic prompt.
        instruction = data.get("instruction", "Continue writing:")
        # Prefer "output"; fall back to "text", then to an empty response.
        output = data.get("output", data.get("text", ""))
        entries.append({"text": f"### Instruction:\n{instruction}\n\n### Response:\n{output}"})

# Wrap the formatted records in a datasets.Dataset with a single "text" column.
dataset = Dataset.from_list(entries)
print(f"Loaded {len(dataset)} training examples")

## Load Model with QLoRA

In [None]:
# Checkpoint used for both the tokenizer and the model weights.
MODEL_NAME = "unsloth/llama-3-8b-instruct-bnb-4bit"

# Tokenizer: fall back to the EOS token for padding when no pad token is set.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# 4-bit NF4 quantization settings, with bfloat16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Each process loads its own copy of the model: the root module ("") is mapped
# to the device index given by this process's local rank.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=bnb_config,
    device_map={"": accelerator.local_process_index},
)

# LoRA adapter settings; adapters are attached to the listed projection modules.
lora_config = LoraConfig(
    r=32,
    lora_alpha=16,
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)

## Train

In [None]:
# Training hyperparameters, grouped by purpose.
training_args = TrainingArguments(
    # Where checkpoints are written and how often.
    output_dir="multigpu_output",
    save_steps=100,
    logging_steps=10,
    # Optimisation schedule.
    num_train_epochs=1,
    learning_rate=2e-4,
    # Per-process batch size of 2, with gradients accumulated over 4 steps.
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    # Precision and DDP behaviour.
    bf16=True,
    ddp_find_unused_parameters=False,
)

# Supervised fine-tuning trainer; the LoRA config is handed over via
# peft_config rather than applied to the model beforehand.
trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    peft_config=lora_config,
)

# Run the training loop.
trainer.train()

## Save Model

In [None]:
# Only the main process writes the final artifacts, so the other processes do
# not write to the same directory.
if accelerator.is_main_process:
    # The model and tokenizer are saved side by side in one directory.
    export_dir = "multigpu_lora_model"
    trainer.save_model(export_dir)
    tokenizer.save_pretrained(export_dir)
    print("Multi-GPU trained model saved to multigpu_lora_model/")

## Launch Command
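To run this training on multiple GPUs, first export the notebook to a Python script (remove the `!pip install` cell, which is notebook-only syntax), then launch it with Accelerate: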
```bash
accelerate launch --config_file training/accelerate_config.yaml training/train_multigpu.py
```