In [None]:
from diffusers import StableDiffusionPipeline
from torch.utils.data import DataLoader, Dataset
import torch
from torchvision import transforms
from PIL import Image

# Define a custom dataset class
class CustomDataset(torch.utils.data.Dataset):
    """Map-style dataset that pairs each image with its text prompt.

    Args:
        images: Sized, indexable collection of images.
        prompts: Sized, indexable collection of prompts, aligned with
            ``images`` by position.

    Raises:
        ValueError: If ``images`` and ``prompts`` differ in length.
    """

    def __init__(self, images, prompts):
        # Fail fast: a length mismatch would otherwise only surface as an
        # IndexError in the middle of an epoch, or silently drop prompts.
        if len(images) != len(prompts):
            raise ValueError(
                f"images and prompts must have the same length, "
                f"got {len(images)} and {len(prompts)}"
            )
        self.images = images
        self.prompts = prompts

    def __len__(self):
        """Return the number of (image, prompt) pairs."""
        return len(self.images)

    def __getitem__(self, idx):
        """Return the ``(image, prompt)`` pair stored at position ``idx``."""
        return self.images[idx], self.prompts[idx]


In [None]:
import os
from transformers import pipeline
# from groq import Groq
# client = Groq(api_key=os.environ["GROQ_API_KEY"])  # key redacted: never commit secrets; revoke the previously committed key

# MODEL = 'Llama-3.1-8b-instant'


def generate_text_prompt_with_gpt(objects):
    """Build a single text prompt describing every UI element in ``objects``.

    Despite the name, no language model is called: the prompt is assembled
    from a fixed template.

    Args:
        objects: Mapping with the keys ``'category'``, ``'color'``,
            ``'radius'``, ``'text'`` and ``'bbox'``; each value is indexed
            by element position. ``None`` entries are rendered as ``"N/A"``.

    Returns:
        The per-element descriptions joined with a single space (an empty
        string when ``objects['category']`` is empty).
    """
    def _or_na(value):
        # Missing attributes are rendered as the literal string "N/A".
        return "N/A" if value is None else value

    attribute_keys = ('category', 'color', 'radius', 'text', 'bbox')
    descriptions = []
    for position, _ in enumerate(objects['category']):
        element, color, radius, text, bbox = (
            _or_na(objects[key][position]) for key in attribute_keys
        )
        descriptions.append(
            "Create a detailed description for a mobile UI element with the following attributes:\n"
            f"Element: {element}, Color: {color}, Radius: {radius}, Text: {text}, Position: {bbox}.\n"
            "Provide a natural and context-rich description for this UI element."
        )

    # One prompt for the whole screen: all element descriptions, space-separated.
    return " ".join(descriptions)


In [None]:
# Use the %pip magic so the package is installed into the environment of the
# running kernel (a plain `!pip` may target a different interpreter).
%pip install datasets



In [None]:
from datasets import load_dataset
from diffusers import StableDiffusionPipeline, UNet2DConditionModel, AutoencoderKL
from transformers import CLIPTextModel
from peft import LoraConfig, get_peft_model
from torch.utils.data import DataLoader
import torch
from torchvision import transforms
from accelerate import Accelerator
import os
from huggingface_hub import login

# Read the Hugging Face token from Colab secrets when running in Colab;
# otherwise fall back to the HUGGINGFACE_TOKEN environment variable so the
# notebook can also run outside Colab.
try:
    from google.colab import userdata
except ImportError:
    hf_token = os.environ.get('HUGGINGFACE_TOKEN')
else:
    hf_token = userdata.get('HUGGINGFACE_TOKEN')

# Authenticate with Hugging Face (skipped when no token is configured)
if hf_token:
    login(hf_token)

# Load dataset
from itertools import islice

print("Loading dataset...")
dataset = load_dataset("mrtoy/mobile-ui-design")["train"]
print(f"Dataset loaded: {len(dataset)} samples")

# Upper bound on the number of samples used for fine-tuning; every
# transformed image is kept in memory, so this also bounds RAM usage.
MAX_SAMPLES = 1000

transform = transforms.Compose([
    transforms.Resize((512, 512)),
    transforms.ToTensor(),
    # Rescale pixel values from [0, 1] to [-1, 1]
    transforms.Normalize([0.5], [0.5])
])

# Generate prompts and prepare images in a single pass, so each sample is
# read from the dataset only once and prompts/images stay aligned by index.
print("Generating prompts and transforming images...")
prompts = []
images = []
for item in islice(dataset, MAX_SAMPLES):
    prompts.append(generate_text_prompt_with_gpt(item['objects']))
    images.append(transform(item['image'].convert("RGB")))
count = len(prompts)  # number of samples actually prepared

# Create DataLoader
train_dataset = CustomDataset(images, prompts)
train_dataloader = DataLoader(train_dataset, batch_size=2, shuffle=True)

# Initialize Stable Diffusion components.
# The previous id ("stabilityai/stable-diffusion-3-medium") failed to load
# with an OSError (no weight files found), and Stable Diffusion 3 is built
# around a transformer rather than a UNet, so it cannot back a
# UNet2DConditionModel. Use a UNet-based checkpoint in diffusers layout.
model_id = "CompVis/stable-diffusion-v1-4"
text_encoder = CLIPTextModel.from_pretrained(model_id, subfolder="text_encoder")
vae = AutoencoderKL.from_pretrained(model_id, subfolder="vae")
unet = UNet2DConditionModel.from_pretrained(model_id, subfolder="unet")

# Only the LoRA adapter on the UNet is trained; keep the VAE and the text
# encoder frozen.
vae.requires_grad_(False)
text_encoder.requires_grad_(False)

# Apply LoRA for Fine-Tuning
print("Applying LoRA to UNet...")
lora_config = LoraConfig(
    r=4,
    lora_alpha=32,
    # Cross-attention projections of the diffusers UNet (`attn2` blocks);
    # PEFT matches these entries as module-name suffixes.
    target_modules=["attn2.to_q", "attn2.to_k", "attn2.to_v", "attn2.to_out.0"],
    # The LoraConfig keyword is `lora_dropout`; `dropout` is not accepted.
    lora_dropout=0.05,
    bias="none",
)
unet = get_peft_model(unet, lora_config)

# Fine-tuning configurations: optimize only the trainable (LoRA) parameters
trainable_params = [param for param in unet.parameters() if param.requires_grad]
optimizer = torch.optim.AdamW(trainable_params, lr=1e-4)
accelerator = Accelerator()
unet, optimizer, train_dataloader = accelerator.prepare(unet, optimizer, train_dataloader)

# Training Loop
from diffusers import DDPMScheduler
from transformers import CLIPTokenizer

# The DataLoader yields prompts as raw strings, so they must be tokenized
# before being passed to the text encoder.
tokenizer = CLIPTokenizer.from_pretrained(model_id, subfolder="tokenizer")
# The scheduler defines the forward (noising) process used to build the
# training inputs for each sampled timestep.
noise_scheduler = DDPMScheduler.from_pretrained(model_id, subfolder="scheduler")

# The VAE and text encoder are not passed through `accelerator.prepare`, so
# place them on the training device explicitly and keep them in eval mode.
vae.to(accelerator.device).eval()
text_encoder.to(accelerator.device).eval()

print("Starting training loop...")
unet.train()
epochs = 3
for epoch in range(epochs):
    print(f"Epoch {epoch + 1}/{epochs} started")
    epoch_loss = 0.0
    num_batches = 0
    # Distinct loop names avoid clobbering the module-level `images` list.
    for batch_images, batch_prompts in train_dataloader:
        # Move data to the accelerator device
        batch_images = batch_images.to(accelerator.device)

        # Frozen components: no gradients are needed for conditioning/latents.
        with torch.no_grad():
            token_ids = tokenizer(
                list(batch_prompts),
                padding="max_length",
                max_length=tokenizer.model_max_length,
                truncation=True,  # prompts can exceed the encoder's context length
                return_tensors="pt",
            ).input_ids.to(accelerator.device)
            encoder_hidden_states = text_encoder(token_ids)[0]

            # Prepare latents
            latents = vae.encode(batch_images).latent_dist.sample()
            latents = latents * vae.config.scaling_factor

        noise = torch.randn_like(latents)
        timesteps = torch.randint(
            0,
            noise_scheduler.config.num_train_timesteps,
            (latents.size(0),),
            device=latents.device,
        ).long()

        # Noise the latents according to the sampled timestep; simply adding
        # the noise would ignore the timestep the UNet is conditioned on.
        noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps)

        # Regression target depends on how the checkpoint was trained.
        if noise_scheduler.config.prediction_type == "v_prediction":
            target = noise_scheduler.get_velocity(latents, noise, timesteps)
        else:
            target = noise

        # UNet output and loss computation
        pred_noise = unet(noisy_latents, timesteps, encoder_hidden_states).sample
        loss = torch.nn.functional.mse_loss(pred_noise, target)

        # Backward pass
        accelerator.backward(loss)
        optimizer.step()
        optimizer.zero_grad()

        epoch_loss += loss.item()
        num_batches += 1

    # Report the mean loss over the epoch rather than only the last batch
    # (also safe when the dataloader is empty).
    print(f"Epoch {epoch + 1}: Loss {epoch_loss / max(num_batches, 1)}")

# Save the fine-tuned LoRA model (unwrap any wrapper added by Accelerate first)
print("Saving fine-tuned model...")
accelerator.unwrap_model(unet).save_pretrained("fine_tuned_unet_lora")


Loading dataset...
Dataset loaded: 7846 samples
Generating prompts and transforming images...


OSError: stabilityai/stable-diffusion-3-medium does not appear to have a file named pytorch_model.bin, model.safetensors, tf_model.h5, model.ckpt or flax_model.msgpack.

In [None]:
# NOTE: `!cd` runs in a temporary subshell, so this does not change the
# notebook's working directory (the `%cd` magic would). Later cells use paths
# relative to the unchanged directory (e.g. `diffusers/examples/...`).
!cd ..

In [None]:
# The diffusers repository must be cloned once into the working directory
# (clone command kept below, commented out).
# !git clone https://github.com/huggingface/diffusers
# NOTE: `!cd` only affects a temporary subshell; the notebook's working
# directory is unchanged after this line.
!cd diffusers


In [None]:
# List the current working directory (the recorded output shows the
# `diffusers` checkout next to `sample_data`).
!ls

diffusers  sample_data


In [None]:
# NOTE: this fails ("No such file or directory" in the recorded output):
# earlier `!cd` calls did not persist, so the path is resolved from the
# original working directory, where `examples/` does not exist.
!cd examples/text_to_image

/bin/bash: line 1: cd: examples/text_to_image: No such file or directory


In [None]:
# Install the SDXL example's requirements into the running kernel's
# environment (%pip targets the kernel; plain `!pip` may not).
%pip install -r diffusers/examples/text_to_image/requirements_sdxl.txt



In [None]:
# Interactive Hugging Face login: prompts for an access token and optionally
# stores it as a git credential (see the recorded prompts below).
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    A token is already saved on your machine. Run `huggingface-cli whoami` to get more information or `huggingface-cli logout` if you want to log out.
    Setting a new token will erase the existing one.
    To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) y
Token is valid (permission: fineG

In [None]:
# Install into the running kernel's environment and quote the extras spec so
# the shell never treats the square brackets as a glob pattern.
# NOTE(review): the recorded output shows this downloading torch 2.4.1 to
# satisfy `diffusers[torch]`; presumably that can leave preinstalled
# torch-dependent packages mismatched. Confirm, and restart the runtime
# after installing.
%pip install "diffusers[torch]" transformers

Collecting torch<2.5.0,>=1.4 (from diffusers[torch])
  Downloading torch-2.4.1-cp310-cp310-manylinux1_x86_64.whl.metadata (26 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch<2.5.0,>=1.4->diffusers[torch])
  Downloading nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch<2.5.0,>=1.4->diffusers[torch])
  Downloading nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch<2.5.0,>=1.4->diffusers[torch])
  Downloading nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch<2.5.0,>=1.4->diffusers[torch])
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch<2.5.0,>=1.4->diffusers[torch])
  Downloading nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_

In [None]:
# Write a default Accelerate configuration file (the recorded output shows
# it saved under /root/.cache/huggingface/accelerate/).
!accelerate config default

accelerate configuration saved at /root/.cache/huggingface/accelerate/default_config.yaml


In [None]:
# Launch the diffusers SDXL LoRA text-to-image example script through
# Accelerate, training on the mobile UI dataset and pushing the result to the
# Hub as "sdxl-lora-testing".
# NOTE(review): the validation prompt is unrelated to mobile UI screens;
# consider a UI-style prompt so validation images are meaningful.
# NOTE(review): this notebook only reads the `image` and `objects` fields of
# the dataset; presumably the script needs a caption column, so confirm one
# exists or pass the script's caption-column option.
# NOTE(review): `--random_flip` presumably mirrors training images, which
# would reverse on-screen text in UI screenshots. Confirm this is intended.
!accelerate launch diffusers/examples/text_to_image/train_text_to_image_lora_sdxl.py \
  --pretrained_model_name_or_path="stabilityai/stable-diffusion-xl-base-1.0" \
  --pretrained_vae_model_name_or_path="madebyollin/sdxl-vae-fp16-fix" \
  --dataset_name="mrtoy/mobile-ui-design" \
  --validation_prompt="An inventor tinkers with a complex machine in his workshop, oblivious to the setting sun outside" \
  --num_validation_images=4 \
  --validation_epochs=1 \
  --output_dir="output" \
  --resolution=1024 \
  --center_crop \
  --random_flip \
  --train_text_encoder \
  --train_batch_size=1 \
  --num_train_epochs=10 \
  --checkpointing_steps=500 \
  --gradient_accumulation_steps=4 \
  --learning_rate=1e-04 \
  --lr_warmup_steps=0 \
  --dataloader_num_workers=8 \
  --allow_tf32 \
  --mixed_precision="fp16" \
  --push_to_hub \
  --hub_model_id="sdxl-lora-testing"



Traceback (most recent call last):
  File "/usr/local/bin/accelerate", line 5, in <module>
    from accelerate.commands.accelerate_cli import main
  File "/usr/local/lib/python3.10/dist-packages/accelerate/commands/accelerate_cli.py", line 19, in <module>
    from accelerate.commands.estimate import estimate_command_parser
  File "/usr/local/lib/python3.10/dist-packages/accelerate/commands/estimate.py", line 34, in <module>
    import timm
  File "/usr/local/lib/python3.10/dist-packages/timm/__init__.py", line 2, in <module>
    from .layers import is_scriptable, is_exportable, set_scriptable, set_exportable
  File "/usr/local/lib/python3.10/dist-packages/timm/layers/__init__.py", line 8, in <module>
    from .classifier import create_classifier, ClassifierHead, NormMlpClassifierHead, ClNormMlpClassifierHead
  File "/usr/local/lib/python3.10/dist-packages/timm/layers/classifier.py", line 15, in <module>
    from .create_norm import get_norm_layer
  File "/usr/local/lib/python3.10/dist-