# CLIP Guided Stable Diffusion using [d🧨ffusers](https://github.com/huggingface/diffusers)

This notebook shows how to do CLIP guidance with Stable diffusion using diffusers libray. This allows you to use newly released [CLIP models by LAION AI.](https://huggingface.co/laion).

This notebook is based on the following amazing repos, all credits to the original authors!

- https://github.com/Jack000/glid-3-xl
- https://github.dev/crowsonkb/k-diffusion


### Initial Setup

In [6]:
#@title Instal dependancies
!pip install diffusers[torch] transformers scipy ftfy accelerate datasets --upgrade
!pip install numpy==1.24.3
!pip install gradio



#### Authenticate with Hugging Face Hub

To use private and gated models on 🤗 Hugging Face Hub, login is required. If you are only using a public checkpoint (such as `CompVis/stable-diffusion-v1-4` in this notebook), you can skip this step.

In [7]:
# #@title Login
# from huggingface_hub import notebook_login

# notebook_login()

### CLIP Guided Stable Diffusion

In [8]:
#@title Load the pipeline
import torch
from PIL import Image

from diffusers import LMSDiscreteScheduler, DiffusionPipeline, PNDMScheduler
from transformers import CLIPFeatureExtractor, CLIPModel

model_id = "CompVis/stable-diffusion-v1-4" #@param {type: "string"}
clip_model_id = "laion/CLIP-ViT-B-32-laion2B-s34B-b79K" #@param ["laion/CLIP-ViT-B-32-laion2B-s34B-b79K", "laion/CLIP-ViT-L-14-laion2B-s32B-b82K", "laion/CLIP-ViT-H-14-laion2B-s32B-b79K", "laion/CLIP-ViT-g-14-laion2B-s12B-b42K", "openai/clip-vit-base-patch32", "openai/clip-vit-base-patch16", "openai/clip-vit-large-patch14"] {allow-input: true}
scheduler = "plms" #@param ['plms', 'lms']


def image_grid(imgs, rows, cols):
    assert len(imgs) == rows*cols

    w, h = imgs[0].size
    grid = Image.new('RGB', size=(cols*w, rows*h))
    grid_w, grid_h = grid.size

    for i, img in enumerate(imgs):
        grid.paste(img, box=(i%cols*w, i//cols*h))
    return grid

if scheduler == "lms":
    scheduler = LMSDiscreteScheduler(beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear")
else:
    scheduler = PNDMScheduler.from_config(model_id, subfolder="scheduler")


feature_extractor = CLIPFeatureExtractor.from_pretrained(clip_model_id)
clip_model = CLIPModel.from_pretrained(clip_model_id, torch_dtype=torch.float16)


guided_pipeline = DiffusionPipeline.from_pretrained(
    model_id,
    custom_pipeline="clip_guided_stable_diffusion",
    custom_revision="main",  # TODO: remove if diffusers>=0.12.0
    clip_model=clip_model,
    feature_extractor=feature_extractor,
    scheduler=scheduler,
    torch_dtype=torch.float16,
)
guided_pipeline = guided_pipeline.to("cuda")

  deprecate("config-passed-as-path", "1.0.0", deprecation_message, standard_warn=False)


Loading pipeline components...:   0%|          | 0/6 [00:00<?, ?it/s]

In [18]:
import gradio as gr
import torch
from diffusers import StableDiffusionPipeline
from transformers import CLIPProcessor, CLIPModel

# Define your clip-guided pipeline (function defined earlier)
def create_clip_guided_pipeline(stable_model_id="CompVis/stable-diffusion-v1-4", clip_model_id="openai/clip-vit-base-patch32"):
    # Stable Diffusion pipeline
    stable_pipeline = StableDiffusionPipeline.from_pretrained(stable_model_id)
    stable_pipeline = stable_pipeline.to("cuda") if torch.cuda.is_available() else stable_pipeline.to("cpu")

    # CLIP model
    clip_model = CLIPModel.from_pretrained(clip_model_id)
    clip_processor = CLIPProcessor.from_pretrained(clip_model_id)

    # Function to calculate CLIP loss
    def clip_loss(image, prompt):
        inputs = clip_processor(text=prompt, images=image, return_tensors="pt", padding=True).to(device)
        outputs = clip_model(**inputs)
        image_embeds = outputs.image_embeds
        text_embeds = outputs.text_embeds
        loss = -torch.cosine_similarity(image_embeds, text_embeds).mean()
        return loss

    return stable_pipeline  # Adjust as needed

# Global variable to track the last model used
last_model = None
guided_pipeline = None  # Global variable to store the pipeline


def infer(prompt, clip_prompt, samples, steps, clip_scale, scale, seed, clip_model, use_cutouts, num_cutouts):
    global last_model
    global guided_pipeline

    # Check if we need to initialize or reinitialize the pipeline
    if last_model != clip_model:
        print(f"Reinitializing model: {clip_model}")
        guided_pipeline = create_clip_guided_pipeline(stable_model_id="CompVis/stable-diffusion-v1-4", clip_model_id=clip_model)
        last_model = clip_model
    else:
        print(f"Using cached model: {last_model}")

    # Parameters
    num_samples = samples
    num_inference_steps = steps
    guidance_scale = scale
    clip_guidance_scale = clip_scale
    seed = seed

    # Image generation
    generator = torch.Generator(device="cuda").manual_seed(seed)
    images = []
    for _ in range(num_samples):
        image = guided_pipeline(
            prompt,
            num_inference_steps=num_inference_steps,
            guidance_scale=guidance_scale,
            generator=generator
        ).images[0]
        images.append(image)

    return images

# Define Gradio Interface
css = """
        .gradio-container {
            font-family: 'IBM Plex Sans', sans-serif;
        }
"""

block = gr.Blocks(css=css)

examples = [
    ['A high tech solarpunk utopia in the Amazon rainforest', 2, 45, 7.5, 1024],
    ['A pikachu fine dining with a view to the Eiffel Tower', 2, 45, 7, 1024],
]

with block:
    gr.HTML("""
        <div style="text-align: center; max-width: 650px; margin: 0 auto;">
            <h1>CLIP Guided Stable Diffusion Demo</h1>
        </div>
    """)

    with gr.Group():
        with gr.Column():
            text = gr.Textbox(label="Enter your prompt", show_label=False, max_lines=1, placeholder="Enter your prompt")
            btn = gr.Button("Generate image")

        gallery = gr.Gallery(label="Generated images", show_label=False, elem_id="gallery")

        advanced_button = gr.Button("Advanced options")

        with gr.Row():
            clip_prompt = gr.Textbox(label="Enter a CLIP prompt if you want it to differ", show_label=False, max_lines=1, placeholder="Enter a CLIP prompt if you want it to differ")
            samples = gr.Slider(label="Images", minimum=1, maximum=2, value=1, step=1)
            steps = gr.Slider(label="Steps", minimum=1, maximum=50, value=45, step=1)
            use_cutouts = gr.Checkbox(label="Use cutouts?")
            num_cutouts = gr.Slider(label="Cutouts", minimum=1, maximum=16, value=4, step=1)

        clip_model = gr.Dropdown(["laion/CLIP-ViT-B-32-laion2B-s34B-b79K", "laion/CLIP-ViT-L-14-laion2B-s32B-b82K"], value="laion/CLIP-ViT-B-32-laion2B-s34B-b79K", show_label=False)
        scale = gr.Slider(label="Guidance Scale", minimum=0, maximum=50, value=7.5, step=0.1)
        seed = gr.Slider(label="Seed", minimum=0, maximum=2147483647, step=1, randomize=True)
        clip_scale = gr.Slider(label="CLIP Guidance Scale", minimum=0, maximum=5000, value=100, step=1)

        ex = gr.Examples(examples=examples, fn=infer, inputs=[text, clip_prompt, samples, steps, scale, clip_scale, seed, clip_model, use_cutouts, num_cutouts], outputs=gallery, cache_examples=False)

        text.submit(infer, inputs=[text, clip_prompt, samples, steps, scale, clip_scale, seed, clip_model, use_cutouts, num_cutouts], outputs=gallery)
        btn.click(infer, inputs=[text, clip_prompt, samples, steps, scale, clip_scale, seed, clip_model, use_cutouts, num_cutouts], outputs=gallery)

block.launch(debug=True)


Setting queue=True in a Colab notebook requires sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://7ef050f63f1bcfb167.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


Reinitializing model: laion/CLIP-ViT-B-32-laion2B-s34B-b79K


Loading pipeline components...:   0%|          | 0/7 [00:00<?, ?it/s]

tokenizer_config.json:   0%|          | 0.00/904 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/862k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/525k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.22M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/389 [00:00<?, ?B/s]

  0%|          | 0/45 [00:00<?, ?it/s]

Using cached model: laion/CLIP-ViT-B-32-laion2B-s34B-b79K


  0%|          | 0/45 [00:00<?, ?it/s]

Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7860 <> https://7ef050f63f1bcfb167.gradio.live




In [20]:
#@title Generate on Colab

prompt = "A photograph of an Olympic bicycle race in Italy " #@param {type: "string"}
#@markdown `clip_prompt` is optional, if you leave it blank the same prompt is sent to Stable Diffusion and CLIP
clip_prompt = "" #@param {type: "string"}
num_samples = 1 #@param {type: "number"}
num_inference_steps = 50 #@param {type: "number"}
guidance_scale = 7.5 #@param {type: "number"}
clip_guidance_scale = 100 #@param {type: "number"}
num_cutouts = 4 #@param {type: "number"}
use_cutouts = "False" #@param ["False", "True"]
unfreeze_unet = "True" #@param ["False", "True"]
unfreeze_vae = "True" #@param ["False", "True"]
seed = 3788086447 #@param {type: "number"}

if unfreeze_unet == "True":
  guided_pipeline.unfreeze_unet()
else:
  guided_pipeline.freeze_unet()

if unfreeze_vae == "True":
  guided_pipeline.unfreeze_vae()
else:
  guided_pipeline.freeze_vae()

generator = torch.Generator(device="cuda").manual_seed(seed)

images = []
for i in range(num_samples):
    image = guided_pipeline(
        prompt,
        clip_prompt=clip_prompt if clip_prompt.strip() != "" else None,
        num_inference_steps=num_inference_steps,
        guidance_scale=guidance_scale,
        clip_guidance_scale=clip_guidance_scale,
        num_cutouts=num_cutouts,
        use_cutouts=use_cutouts == "True",
        generator=generator,
    ).images[0]
    images.append(image)

image_grid(images, 1, num_samples)

AttributeError: 'StableDiffusionPipeline' object has no attribute 'unfreeze_unet'