In [1]:
# Install required libraries
!pip install torch torchvision transformers diffusers ftfy
!pip install accelerate




In [2]:
import torch
from transformers import CLIPProcessor, CLIPModel
from diffusers import StableDiffusionPipeline, DPMSolverMultistepScheduler
from PIL import Image
import requests
!pip install torch torchvision diffusers transformers ftfy accelerate bitsandbytes
!pip install datasets
!pip install "git+https://github.com/huggingface/transformers.git"


Collecting git+https://github.com/huggingface/transformers.git
  Cloning https://github.com/huggingface/transformers.git to /tmp/pip-req-build-z8qu5hur
  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/transformers.git /tmp/pip-req-build-z8qu5hur
  Resolved https://github.com/huggingface/transformers.git to commit 40821a247823b35d7ff10ba490d0d930fe8f5afa
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone


In [3]:
# Load CLIP model and processor
clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

# Upload your image
def upload_image():
    from google.colab import files
    uploaded = files.upload()
    for filename in uploaded.keys():
        return Image.open(filename)

# Process the image
def extract_image_features(image):
    inputs = clip_processor(text=None, images=image, return_tensors="pt", padding=True)
    image_features = clip_model.get_image_features(**inputs)
    return image_features

# Example usage
uploaded_image = upload_image()  # Upload the sample image
image_features = extract_image_features(uploaded_image)  # Extract image features


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Saving output-2 (1).jpeg to output-2 (1).jpeg


In [15]:
from google.colab import files
from transformers import CLIPProcessor, CLIPModel
from diffusers import StableDiffusionPipeline, UNet2DConditionModel, AutoencoderKL, DPMSolverMultistepScheduler
from torch.utils.data import Dataset, DataLoader
import torch
from PIL import Image
import os
from tqdm import tqdm
import random
from torchvision import transforms  # Import necessary transformation functions

# Function to upload images in Google Colab
def upload_image():
    uploaded = files.upload()
    image_path = list(uploaded.keys())[0]  # Get the uploaded image file name
    return image_path

# Load and Preprocess Your Dataset
class CustomDataset(Dataset):
    def __init__(self, image_folder, captions, processor):
        self.image_folder = image_folder
        self.captions = captions
        self.processor = processor  # Add the processor to encode captions
        self.image_paths = [f for f in os.listdir(image_folder) if os.path.isfile(os.path.join(image_folder, f)) and f.lower().endswith(('png', 'jpg', 'jpeg'))]

        # Define the transformation to convert images to tensor
        self.transform = transforms.Compose([
            transforms.Resize((512, 512)),  # Resize to a standard size (optional)
            transforms.ToTensor(),  # Convert PIL image to tensor
            transforms.Lambda(lambda x: x * 2 - 1)  # Normalize to [-1, 1] range
        ])

    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self, idx):
        image_path = os.path.join(self.image_folder, self.image_paths[idx])
        image = Image.open(image_path).convert("RGB")

        # Convert image to RGBA (4 channels)
        image = image.convert("RGBA")  # Convert to 4 channels

        image = self.transform(image)  # Apply the transformation to the image

        # Tokenize the caption to get encoder hidden states
        caption = self.captions[idx]
        inputs = self.processor(text=caption, return_tensors="pt", padding=True, truncation=True)
        encoder_hidden_states = inputs.input_ids.squeeze(0)  # Flatten the batch dimension

        return {"image": image, "encoder_hidden_states": encoder_hidden_states}

# Define the training loop
def train(model, train_loader, optimizer, scheduler, device, num_epochs=5):
    model.train()
    for epoch in range(num_epochs):
        loop = tqdm(train_loader, total=len(train_loader), desc=f"Epoch {epoch+1}/{num_epochs}")
        for batch in loop:
            images = batch["image"].to(device).half()  # Convert images to float16
            encoder_hidden_states = batch["encoder_hidden_states"].to(device).long()

            # Generate a random timestep for each batch (simulating denoising process)
            timesteps = torch.randint(0, 1000, (images.size(0),), device=device)  # Example random timesteps

            # Here you could add custom logic for conditioning the model on the caption and image
            outputs = model(images, encoder_hidden_states=encoder_hidden_states, timestep=timesteps)

            loss = outputs.loss
            loss.backward()

            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()

            loop.set_postfix(loss=loss.item())

# Function to load the pre-trained model
def load_stable_diffusion():
    model_id = "CompVis/stable-diffusion-v1-4"  # Pre-trained Stable Diffusion
    pipeline = StableDiffusionPipeline.from_pretrained(model_id, torch_dtype=torch.float16)
    pipeline = pipeline.to("cuda")
    return pipeline

# Fine-tune Stable Diffusion
def fine_tune_stable_diffusion(image_folder, captions, processor, num_epochs=5):
    # Load your dataset
    dataset = CustomDataset(image_folder, captions, processor)
    train_loader = DataLoader(dataset, batch_size=2, shuffle=True)

    # Load the Stable Diffusion model
    pipeline = load_stable_diffusion()

    # Fine-tuning the UNet model only (UNet2DConditionModel)
    unet = pipeline.unet
    optimizer = torch.optim.AdamW(unet.parameters(), lr=1e-5)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.9)

    # Train the model
    train(unet, train_loader, optimizer, scheduler, torch.device("cuda"), num_epochs)

    # Save the fine-tuned model
    unet.save_pretrained("fine_tuned_unet_model")
    return pipeline

# Upload image
uploaded_image_path = upload_image()
print(f"Uploaded Image Path: {uploaded_image_path}")

# Define the image folder and captions
# Move the uploaded image to a known folder for consistency
image_folder = "/content/images/"
os.makedirs(image_folder, exist_ok=True)
os.rename(uploaded_image_path, os.path.join(image_folder, uploaded_image_path))

# Define captions for the fine-tuning images
captions = [
    "A young woman with dark hair, holding a British Longhair cat in a cozy room.",
    "A smiling girl in a pink sweater holding a fluffy cat in her hands.",
    "A calm and relaxed young woman with long dark hair and a pink sweater, holding a cute fluffy cat.",
    "A young woman with expressive eyes and soft features, holding her cat in a warm, cozy atmosphere."
]

# Initialize the CLIPProcessor to handle the text embeddings (using CLIP model)
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

# Fine-tune the model with the uploaded image
fine_tuned_model = fine_tune_stable_diffusion(image_folder, captions, processor)

# Generate images with the fine-tuned model
def generate_image_with_finetuned_model(prompt, model):
    generated_images = model(prompt).images
    return generated_images

# Example usage with a new prompt
prompt = "A beautiful girl holding a fluffy British Longhair cat in a cozy room."
generated_images = generate_image_with_finetuned_model(prompt, fine_tuned_model)

# Display and save the generated images
for idx, img in enumerate(generated_images):
    img.show()
    img.save(f"generated_image_{idx}.png")


Saving output-0 (2).jpeg to output-0 (2) (1).jpeg
Saving output-1 (3).jpeg to output-1 (3) (7).jpeg
Saving output-2 (1).jpeg to output-2 (1) (7).jpeg
Saving output-3 (1).jpeg to output-3 (1) (7).jpeg
Uploaded Image Path: output-0 (2) (1).jpeg


Loading pipeline components...:   0%|          | 0/7 [00:00<?, ?it/s]

Epoch 1/5:   0%|          | 0/1 [00:00<?, ?it/s]


ValueError: not enough values to unpack (expected 3, got 2)

In [13]:
# Load Stable Diffusion model
def load_stable_diffusion():
    sd_pipeline = StableDiffusionPipeline.from_pretrained(
        "CompVis/stable-diffusion-v1-4", torch_dtype=torch.float16, revision="fp16"
    )
    sd_pipeline.scheduler = DPMSolverMultistepScheduler.from_config(sd_pipeline.scheduler.config)
    sd_pipeline = sd_pipeline.to("cuda")
    return sd_pipeline

# Generate image based on text prompt and image features
def generate_new_design(prompt, image_features, sd_pipeline, num_inference_steps=50):
    with torch.no_grad():
        # Merge image features into the prompt
        combined_prompt = f"{prompt}. Inspired by the uploaded image."
        generated_images = sd_pipeline(combined_prompt, num_inference_steps=num_inference_steps).images
        return generated_images

# Example usage
stable_diffusion_pipeline = load_stable_diffusion()
prompt = "A stunning young woman with shoulder-length dark hair, subtle highlights, and expressive almond-shaped eyes, holding a fluffy British Longhair cat in her arms. She wears a cozy, oversized pink sweater and stands in a softly lit room with a warm, natural glow. The scene radiates a serene and comforting atmosphere, with the cat snuggled contentedly against her."
generated_images = generate_new_design(prompt, image_features, stable_diffusion_pipeline)

# Display the generated images
for idx, img in enumerate(generated_images):
    img.show()
    img.save(f"generated_design_{idx}.png")




Loading pipeline components...:   0%|          | 0/7 [00:00<?, ?it/s]

An error occurred while trying to fetch /root/.cache/huggingface/hub/models--CompVis--stable-diffusion-v1-4/snapshots/2880f2ca379f41b0226444936bb7a6766a227587/vae: Error no file named diffusion_pytorch_model.safetensors found in directory /root/.cache/huggingface/hub/models--CompVis--stable-diffusion-v1-4/snapshots/2880f2ca379f41b0226444936bb7a6766a227587/vae.
Defaulting to unsafe serialization. Pass `allow_pickle=False` to raise an error instead.
An error occurred while trying to fetch /root/.cache/huggingface/hub/models--CompVis--stable-diffusion-v1-4/snapshots/2880f2ca379f41b0226444936bb7a6766a227587/unet: Error no file named diffusion_pytorch_model.safetensors found in directory /root/.cache/huggingface/hub/models--CompVis--stable-diffusion-v1-4/snapshots/2880f2ca379f41b0226444936bb7a6766a227587/unet.
Defaulting to unsafe serialization. Pass `allow_pickle=False` to raise an error instead.
Token indices sequence length is longer than the specified maximum sequence length for this mo

  0%|          | 0/50 [00:00<?, ?it/s]