In [None]:
# Install required libraries in Colab
!pip install diffusers transformers accelerate safetensors


In [None]:
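# 📦 Install dependencies (runs every time; pip skips packages that are already satisfied).
# NOTE(review): the first install line repeats the one in the previous cell, and no
# visible cell imports the `clip` package installed from the OpenAI repo — CLIP is
# loaded through `transformers` below. Confirm against the rest of the notebook
# before removing either line.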
!pip install diffusers transformers accelerate safetensors
!pip install git+https://github.com/openai/CLIP.git

# 📥 Imports
from diffusers import StableDiffusionPipeline
from transformers import CLIPProcessor, CLIPModel
import torch
from PIL import Image
from IPython.display import display
import numpy as np
from tqdm import tqdm

# 📌 Pick the compute device and the matching Stable Diffusion precision:
# half precision on a CUDA GPU, full precision on the CPU.
if torch.cuda.is_available():
    device, _sd_dtype = "cuda", torch.float16
else:
    device, _sd_dtype = "cpu", torch.float32

# 📥 Stable Diffusion v1.5 pipeline, moved onto the selected device.
pipe = StableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5",
    torch_dtype=_sd_dtype,
)
pipe = pipe.to(device)

# 📥 CLIP model and its processor, used later to score generated images against
# the prompt. Both are loaded from the same checkpoint; only the model is moved
# to the device.
_clip_checkpoint = "openai/clip-vit-base-patch32"
clip_model = CLIPModel.from_pretrained(_clip_checkpoint)
clip_model = clip_model.to(device)
clip_processor = CLIPProcessor.from_pretrained(_clip_checkpoint)

# 📌 User Inputs
def _ask_number(message, cast, minimum=None):
    """Prompt until the reply parses with ``cast`` and satisfies ``minimum``.

    Args:
        message: Text shown to the user by ``input``.
        cast: Callable (``int`` or ``float``) that converts the reply; a
            ``ValueError`` from it triggers another prompt instead of
            aborting the cell.
        minimum: Smallest accepted value, or ``None`` for no lower bound.

    Returns:
        The converted value.
    """
    while True:
        reply = input(message)
        try:
            value = cast(reply)
        except ValueError:
            print(f"Invalid value {reply!r}; please enter a number.")
            continue
        if minimum is not None and value < minimum:
            print(f"Value must be at least {minimum}.")
            continue
        return value


prompt = input("Enter your prompt: ")
guidance_scale = _ask_number("Enter guidance scale (5-15 recommended): ", float)
# At least one denoising step and one image are required for the cells below
# to produce any output.
num_inference_steps = _ask_number(
    "Enter number of inference steps (20-50 recommended): ", int, minimum=1
)
num_images = _ask_number(
    "How many images to generate for this prompt? ", int, minimum=1
)

# 📦 Image Generation + CLIP Scoring
# For every requested image: sample one image from the diffusion pipeline,
# score it against the prompt with CLIP, then display it with its score.
images = []
clip_scores = []

for i in tqdm(range(num_images)):
    with torch.autocast(device):
        image = pipe(
            prompt,
            guidance_scale=guidance_scale,
            num_inference_steps=num_inference_steps,
        ).images[0]
    images.append(image)

    # Compute CLIP Score.
    # truncation=True clips the tokenized prompt to the tokenizer's maximum
    # length, so a prompt the diffusion pipeline accepted cannot overflow the
    # CLIP text encoder during scoring.
    inputs = clip_processor(
        text=prompt,
        images=image,
        return_tensors="pt",
        padding=True,
        truncation=True,
    ).to(device)
    # Scoring is evaluation only: skip autograd bookkeeping to save memory.
    with torch.no_grad():
        outputs = clip_model(**inputs)
    # One prompt and one image give a single image-text logit, which is used
    # directly as the score (it is the model's raw logit, not a probability).
    logits_per_image = outputs.logits_per_image
    score = logits_per_image.item()
    clip_scores.append(score)

    display(image)
    print(f"Image {i+1} CLIP score: {score:.4f}")

# 📊 Average CLIP Score
# Guard the empty case: np.mean([]) would report nan with a RuntimeWarning.
if clip_scores:
    print(f"\nAverage CLIP Score for prompt '{prompt}': {np.mean(clip_scores):.4f}")
else:
    print("\nNo images were generated, so there is no average CLIP score to report.")

# 📌 Optional: Latent Space Interpolation (between two prompts)
def interpolate_images(prompt1, prompt2, num_steps=5):
    """Linearly blend the latents of two prompts and decode each blend.

    The pipeline is run once per prompt with ``output_type="latent"``; the two
    resulting latents are mixed as ``(1 - alpha) * latents1 + alpha * latents2``
    for ``num_steps`` evenly spaced ``alpha`` values from 0 to 1. Each mix is
    decoded, displayed, and collected.

    Reads the module-level ``pipe``, ``device``, ``num_inference_steps`` and
    ``guidance_scale``, so the setup and user-input cells must have run first.
    No seed is passed to the pipeline, so repeated calls are not expected to
    reproduce the same images.

    Args:
        prompt1: Prompt for the ``alpha = 0`` endpoint.
        prompt2: Prompt for the ``alpha = 1`` endpoint.
        num_steps: Number of frames to produce. With ``1`` only the
            ``prompt1`` endpoint is decoded; with ``0`` or less nothing is
            generated.

    Returns:
        List of PIL images, one per interpolation step (empty when
        ``num_steps`` is not positive).
    """
    print(f"\nInterpolating between:\n'{prompt1}' ↔ '{prompt2}'")
    latent_images = []

    # Nothing to render: return before paying for two full diffusion runs.
    if num_steps <= 0:
        return latent_images

    # A single step has no second endpoint to divide the range by; clamping
    # the divisor keeps alpha at 0.0 instead of raising ZeroDivisionError.
    divisor = max(num_steps - 1, 1)

    with torch.no_grad(), torch.autocast(device):
        latents1 = pipe(
            prompt1,
            output_type="latent",
            num_inference_steps=num_inference_steps,
            guidance_scale=guidance_scale,
        ).images[0]
        latents2 = pipe(
            prompt2,
            output_type="latent",
            num_inference_steps=num_inference_steps,
            guidance_scale=guidance_scale,
        ).images[0]

        for i in range(num_steps):
            alpha = i / divisor
            latent = (1 - alpha) * latents1 + alpha * latents2
            # unsqueeze(0) restores the batch dimension dropped by `.images[0]`.
            # NOTE(review): `decode_latents` appears to be deprecated in recent
            # diffusers releases — confirm against the installed version.
            latent_image = pipe.decode_latents(latent.unsqueeze(0))[0]
            latent_image_pil = pipe.numpy_to_pil(latent_image)[0]
            latent_images.append(latent_image_pil)
            display(latent_image_pil)
            print(f"Interpolation Step {i+1}/{num_steps} | α = {alpha:.2f}")

    return latent_images

# Uncomment to run interpolation between two prompts:
# interpolate_images("A futuristic cyberpunk city", "A medieval castle at sunset", num_steps=6)
