<a href="https://colab.research.google.com/github/MuhammadFasihArif/Image-gen/blob/main/Image_Gen.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Fetch diffusers and install it from the clone, then install the extra
# requirements of the text-to-image example.
# NOTE(review): the example's requirements.txt does not appear to install
# diffusers itself, hence the explicit `pip install .` — confirm against the
# checked-out revision.
git clone https://github.com/huggingface/diffusers.git
cd diffusers
pip install .
cd examples/text_to_image
pip install -r requirements.txt accelerate
# Interactive step: expects a Hugging Face access token on stdin.
huggingface-cli login

# Base checkpoint and dataset location consumed by the launch command below.
export MODEL="CompVis/stable-diffusion-v1-4"
export TRAIN_DIR="/path/to/your/dataset"

# The expansions are quoted so that a dataset path containing spaces is
# passed as a single argument.
accelerate launch train_text_to_image.py \
  --pretrained_model_name_or_path="$MODEL" \
  --train_data_dir="$TRAIN_DIR" \
  --use_ema \
  --resolution=512 --center_crop --random_flip \
  --train_batch_size=1 \
  --gradient_accumulation_steps=4 \
  --gradient_checkpointing \
  --mixed_precision="fp16" \
  --max_train_steps=5000 \
  --learning_rate=1e-5 \
  --output_dir="sd-finetuned"


In [None]:
# train_lora.py — adapted from harrywang/finetune-sd
import torch
from diffusers import StableDiffusionPipeline
from diffusers import LoraConfig, LoraLayer
from torch.utils.data import DataLoader, Dataset
import torchvision.transforms as T
from PIL import Image

class ImageTextDataset(Dataset):
    """Image/caption pairs indexed by a tab-separated captions file.

    Every non-blank line of the captions file has the form
    ``<image path><TAB><caption>``; image paths are resolved relative to
    ``root`` when a sample is loaded.
    """

    def __init__(self, root, captions_file, transform=None):
        """Read the caption index into ``self.items``.

        Args:
            root: Directory the image paths in the captions file are relative to.
            captions_file: Path of the TSV file with one record per line.
            transform: Optional callable applied to each loaded RGB image.

        Raises:
            ValueError: If a non-blank line contains no tab separator.
        """
        self.root, self.transform = root, transform
        self.items = []
        # The context manager closes the file deterministically; the explicit
        # encoding keeps caption decoding independent of the platform locale.
        with open(captions_file, encoding="utf-8") as handle:
            for lineno, raw in enumerate(handle, start=1):
                record = raw.rstrip("\r\n")
                if not record.strip():
                    # Blank lines (e.g. a trailing one) carry no sample.
                    continue
                # Split on the first tab only, so captions may contain tabs.
                img_path, sep, caption = record.partition("\t")
                if not sep:
                    raise ValueError(
                        f"{captions_file}:{lineno}: expected "
                        "'<image path><TAB><caption>'"
                    )
                self.items.append((img_path.strip(), caption.strip()))

    def __len__(self):
        """Return the number of image/caption records."""
        return len(self.items)

    def __getitem__(self, i):
        """Load record ``i`` and return ``(image, caption)``.

        The image is converted to RGB and passed through ``transform`` when
        one was supplied.
        """
        img_path, caption = self.items[i]
        # Convert inside the ``with`` block so the underlying file handle is
        # released as soon as the pixel data has been read.
        with Image.open(f"{self.root}/{img_path}") as source:
            img = source.convert("RGB")
        if self.transform:
            img = self.transform(img)
        return img, caption

def _denoising_loss(pipe, images, captions):
    """Compute the noise-prediction loss for one batch of image/caption pairs.

    Args:
        pipe: Stable Diffusion pipeline supplying the VAE, tokenizer, text
            encoder, noise scheduler and UNet.
        images: Float image batch, expected to be scaled to [-1, 1] already.
        captions: Sequence of prompt strings, one per image.

    Returns:
        Scalar float32 MSE between the UNet output and the sampled noise.
    """
    device = pipe.unet.device

    # The VAE and text encoder are frozen, so no autograd graph is needed.
    with torch.no_grad():
        pixels = images.to(device=device, dtype=pipe.vae.dtype)
        latents = pipe.vae.encode(pixels).latent_dist.sample()
        latents = latents * pipe.vae.config.scaling_factor
        tokens = pipe.tokenizer(
            list(captions),
            padding="max_length",
            max_length=pipe.tokenizer.model_max_length,
            truncation=True,
            return_tensors="pt",
        )
        text_states = pipe.text_encoder(tokens.input_ids.to(device))[0]

    # Forward diffusion: corrupt the latents at a random timestep per sample.
    noise = torch.randn_like(latents)
    timesteps = torch.randint(
        0,
        pipe.scheduler.config.num_train_timesteps,
        (latents.shape[0],),
        device=device,
    )
    noisy_latents = pipe.scheduler.add_noise(latents, noise, timesteps)

    # Autocast lets the fp16 base weights and fp32 adapter weights share one
    # forward pass.
    with torch.autocast(device_type="cuda", dtype=torch.float16):
        pred = pipe.unet(noisy_latents, timesteps, text_states).sample

    # NOTE(review): using the raw noise as target assumes the scheduler's
    # prediction_type is "epsilon" — confirm for the chosen checkpoint.
    return torch.nn.functional.mse_loss(pred.float(), noise.float())


def main():
    """Fine-tune LoRA adapters on the UNet of Stable Diffusion v1.5.

    Loads the fp16 pipeline on CUDA, freezes the base weights, attaches LoRA
    adapters to the UNet attention projections and runs ``steps`` optimizer
    steps over the image/caption pairs under ``data/``. Adapter weights are
    written to ``lora_output`` every 100 completed steps and once more at the
    end if the last step did not coincide with a checkpoint.

    Raises:
        ValueError: If the captions file yields no samples.
        RuntimeError: If attaching the adapter left no trainable parameters.
    """
    pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16).to("cuda")

    # Freeze every base weight; only the adapter weights added below train.
    for module in (pipe.vae, pipe.text_encoder, pipe.unet):
        module.requires_grad_(False)

    # NOTE(review): `LoraConfig` is normally provided by the `peft` package,
    # not `diffusers`; confirm that the file-level import resolves.
    # Targets are attention-projection layer names inside the UNet; the
    # previous "unet.up_blocks"/"unet.down_blocks" entries are block prefixes
    # and presumably match no layer — verify against the installed peft.
    config = LoraConfig(r=4, lora_alpha=16, target_modules=["to_k", "to_q", "to_v", "to_out.0"])
    pipe.unet.add_adapter(config)

    lora_params = [p for p in pipe.unet.parameters() if p.requires_grad]
    if not lora_params:
        raise RuntimeError("no trainable LoRA parameters were added to the UNet")
    # Keep the trained weights in fp32: AdamW updates and the gradient scaler
    # are not reliable on fp16 parameters.
    for param in lora_params:
        param.data = param.data.float()

    transform = T.Compose([
        T.Resize(512),
        T.CenterCrop(512),
        T.RandomHorizontalFlip(),
        T.ToTensor(),
        # Map [0, 1] pixels to [-1, 1] before they are fed to the VAE encoder.
        T.Normalize([0.5], [0.5]),
    ])
    ds = ImageTextDataset("data/images", "data/captions.tsv", transform)
    if len(ds) == 0:
        # Guard: the step loop below would otherwise never terminate.
        raise ValueError("data/captions.tsv contains no samples")
    loader = DataLoader(ds, batch_size=1, shuffle=True)

    opt = torch.optim.AdamW(lora_params, lr=1e-4)
    scaler = torch.cuda.amp.GradScaler()
    steps = 1000
    save_every = 100
    pipe.unet.train()

    step = 0
    # Re-iterate the loader until `steps` optimizer steps are done, so a small
    # dataset no longer truncates training to a single pass.
    while step < steps:
        for img, txt in loader:
            if step >= steps:
                break
            loss = _denoising_loss(pipe, img, txt)
            scaler.scale(loss).backward()
            scaler.step(opt)
            scaler.update()
            opt.zero_grad(set_to_none=True)
            if step % 100 == 0:
                print(f"Step {step}, loss {loss.item():.4f}")
            step += 1
            if step % save_every == 0:
                # Save only the adapter weights, not the whole frozen pipeline.
                pipe.unet.save_lora_adapter("lora_output")

    if step % save_every != 0:
        # Persist the final state when training did not end on a checkpoint.
        pipe.unet.save_lora_adapter("lora_output")

# Start training only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
