In [None]:
# Mount Google Drive; later cells read and write under /content/drive/MyDrive/cv/.
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [None]:
!pip install datasets transformers accelerate torchmetrics[image] diffusers vendi_score vendi_score[images] vendi_score[text,molecules] vendi_score[all]



In [None]:
import os
import json
import random
import numpy as np
import torch

from diffusers import StableDiffusionPipeline
from torch import nn
from torchvision.models import inception_v3, Inception_V3_Weights
from torchvision import transforms
from vendi_score import image_utils, data_utils, vendi
from PIL import Image

import warnings
warnings.filterwarnings("ignore", module="torchvision")

In [None]:
# Hugging Face model repositories; only the first entry is loaded in this cell.
repo_ids = [
    "runwayml/stable-diffusion-v1-5",
    "stabilityai/stable-diffusion-xl-base-1.0",
]

# Load the pipeline with float16 weights from safetensors files, then move it to the GPU.
sd_pipeline = StableDiffusionPipeline.from_pretrained(
    repo_ids[0],
    torch_dtype=torch.float16,
    use_safetensors=True,
)
sd_pipeline = sd_pipeline.to("cuda")

Loading pipeline components...:   0%|          | 0/7 [00:00<?, ?it/s]

In [None]:
# Candidate labels; `configuration` picks the last one. The evaluation loops further down
# rebind `configuration` while iterating over `configs`.
configurations = [
    "d-xl_images",
    "d-xl_color-prompt_images",
    "d-xl_color-direction-prompt_images",
    "d-xl_color-texture-prompt_images",
    "d-color-texture_images",
    "d_color-texture-feather_images",
]
configuration = configurations[-1]

# Prompts of increasing detail. The two longest ones span two lines: a newline followed by
# four spaces of indentation is part of the prompt text itself.
prompts = [
    "A detailed, high-resolution image of a blue bulbul.",
    "A detailed, high-resolution image of a blue bulbul, perched elegantly on a sturdy branch of a lush, green tree.",
    (
        "A detailed, high-resolution image of a blue bulbul, perched elegantly on a sturdy branch of a lush, green tree.\n"
        "    The bulbul's feathers are covered in rain droplets, reflecting the surrounding environment."
    ),
    (
        "A detailed, high-resolution image of a blue bulbul, perched elegantly on a sturdy branch of a lush, green tree.\n"
        "    The bulbul's feathers are covered in rain droplets, reflecting the surrounding environment, adding depth to the scene."
    ),
]

# One folder label per prompt, in the same order as `prompts`.
configs = [
    "d_res-color",
    "d_res-color-environment",
    "d_res-color-environment-texture",
    "d_res-color-environment-texture-description",
]

# Image counts used in the output folder names (`{config}_{total}`).
totals = [50, 500]

# Generate images

In [None]:
import os

# Number of images requested from the pipeline per call.
images_per_call = 5

# Only the last (most detailed) prompt/configuration pair is generated here; add more labels
# to `selected_configs` to generate the others as well.
selected_configs = {configs[-1]}
total = totals[1]

for prompt, config in zip(prompts, configs):
  if config not in selected_configs:
    continue

  # Create the output folder once per configuration instead of once per pipeline call.
  output_dir = f"/content/drive/MyDrive/cv/{config}_{total}"
  os.makedirs(output_dir, exist_ok=True)

  for start in range(0, total, images_per_call):
    # Never request more images than are still missing, so exactly `total` files are written
    # even when `total` is not a multiple of `images_per_call`.
    batch_size = min(images_per_call, total - start)
    images = sd_pipeline(
        prompt=prompt,
        num_images_per_prompt=batch_size,
    ).images

    # Files are numbered consecutively across calls: image0.png, image1.png, ...
    for offset, image in enumerate(images):
      image.save(os.path.join(output_dir, f"image{start + offset}.png"))

## Define Inception Embedding Model

## Sanity check

In [None]:
# A "!VAR=value" line only sets the variable in a throwaway subshell, so it never reaches the
# kernel process that runs PyTorch. Set it on the kernel's own environment instead.
# NOTE(review): PyTorch presumably reads this when its CUDA allocator initialises, so this
# may need to run before the first CUDA allocation (i.e. before the pipeline is loaded) — confirm.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

In [None]:
# Device name for this cell. NOTE(review): nothing in the visible code reads this variable —
# the scoring calls pass their device explicitly — so it may be a leftover.
device = "cuda"

def load_inception_v3():
  """Build torchvision's Inception v3 with its default pretrained weights, in eval mode.

  The final `fc` layer is left untouched; it is not replaced with `torch.nn.Identity()` here.
  """
  network = inception_v3(weights=Inception_V3_Weights.DEFAULT, transform_input=True)
  network.eval()
  return network


def inception_transforms():
  """Return the preprocessing pipeline applied to each image before the Inception model.

  Steps, in order: resize to 299, center-crop to 299, convert to a tensor, then expand the
  leading dimension to 3.
  """
  steps = [
    transforms.Resize(299),
    transforms.CenterCrop(299),
    transforms.ToTensor(),
    # NOTE(review): presumably turns single-channel (grayscale) tensors into 3 channels — confirm.
    transforms.Lambda(lambda x: x.expand(3, -1, -1)),
  ]
  return transforms.Compose(steps)


def get_images_embeddings(images, random_indices, no_repetitions, batch_size=64, device=torch.device("cpu")):
    """Embed `images` with Inception v3, duplicating the first image `no_repetitions` times.

    Only the very first image is duplicated, which is the same multiset of images that
    `get_pixel_images` builds, so both scores are computed on the same data. (Previously
    every image of the first batch was duplicated.)

    Args:
        images: Images accepted by `inception_transforms()`.
        random_indices: Accepted for call compatibility; not used.
        no_repetitions: How many times the first image appears in the embedded set.
        batch_size: Number of images passed to `data_utils.to_batches` per batch.
        device: A `torch.device` or a device string.

    Returns:
        A 2-D numpy array with one row per embedded image tensor.
    """
    if isinstance(device, str):
        device = torch.device(device)

    model = load_inception_v3().to(device)
    transform = inception_transforms()

    embeddings = []
    for index, batch in enumerate(data_utils.to_batches(images, batch_size)):
        tensors = [transform(img) for img in batch]
        if index == 0:
            # Transform the first image once and reuse the tensor for every copy.
            tensors = [tensors[0]] * no_repetitions + tensors[1:]
        x = torch.stack(tensors, 0).to(device)

        with torch.no_grad():
            output = model(x)

        if isinstance(output, (list, tuple)):
            output = output[0]

        # Flatten everything except the batch axis. Unlike squeeze(), this keeps a batch that
        # holds a single image two-dimensional, so np.concatenate always sees matching ranks.
        embeddings.append(output.reshape(output.shape[0], -1).cpu().numpy())
    return np.concatenate(embeddings, 0)


def get_pixel_images(images, random_indices, no_repetitions):
  """Return `images` as a list in which the first image occurs `no_repetitions` times.

  All other images are kept once, in their original order. `random_indices` is accepted
  but not used.
  """
  remaining = iter(images)
  try:
    first = next(remaining)
  except StopIteration:
    # Nothing to duplicate.
    return []
  duplicated = [first for _ in range(no_repetitions)]
  return duplicated + list(remaining)


vendi_scores = []
total = totals[0]
for configuration in configs:
  print(f"Configuration: {configuration}")
  config_scores = []

  # Sort the listing: os.listdir returns names in arbitrary order.
  image_dir = f"/content/drive/MyDrive/cv/{configuration}_{total}"
  all_images = []
  for img_name in sorted(os.listdir(image_dir)):
    # Image.open is lazy and keeps the file open; copying inside the context manager loads
    # the pixels and lets the handle be closed right away.
    with Image.open(os.path.join(image_dir, img_name)) as img:
      all_images.append(img.copy())

  for no_samples in [5, 10, 15, 20, 25, 30, 35, 40, 45, 50]:
    for no_reps in [1, 5, 10, 15, 50]:
      # Passed through to the two helpers below, which accept but do not read it.
      random_indices = random.sample(range(total), no_samples)

      images = random.sample(all_images, no_samples)

      embeddings = get_images_embeddings(images, random_indices, no_reps, batch_size=5, device=torch.device("cuda"))
      inception_vs = vendi.score_X(embeddings)

      pixel_images = get_pixel_images(images, random_indices, no_reps)
      pixel_vs = image_utils.pixel_vendi_score(pixel_images)

      print(f"{no_samples} samples {no_reps} repetitions")
      print(f"inception_vs: {inception_vs}, pixel_vs: {pixel_vs}")
      config_scores.append({
          "samples": no_samples,
          "repetitions": no_reps,
          "scores": {"inception_vs": str(inception_vs), "pixel_vs": str(pixel_vs)}
      })

  vendi_scores.append({
      "configuration": configuration,
      "config_scores": config_scores
  })

  # Rewritten after every configuration so partial results survive an interrupted run.
  # Despite the .jsonl extension the file holds a single JSON array.
  with open("/content/drive/MyDrive/cv/vs_all-configs.jsonl", "w") as jsonfile:
    json.dump(vendi_scores, jsonfile)

In [None]:
def get_inception(pretrained=True, pool=True):
    """Build a torchvision Inception v3 in eval mode.

    Args:
        pretrained: When truthy, use `Inception_V3_Weights.DEFAULT`; otherwise pass `weights=None`.
        pool: When truthy, replace the final `fc` layer with `nn.Identity()`.

    Returns:
        The model, already switched to eval mode.
    """
    chosen_weights = Inception_V3_Weights.DEFAULT if pretrained else None
    network = inception_v3(weights=chosen_weights, transform_input=True)
    network.eval()
    if pool:
        network.fc = nn.Identity()
    return network


def inception_transforms():
    """Return the preprocessing pipeline applied to each image before the Inception model.

    Steps, in order: resize to 299, center-crop to 299, convert to a tensor, then expand the
    leading dimension to 3. This re-defines the `inception_transforms` from the earlier cell
    with the same steps.
    """
    steps = [
        transforms.Resize(299),
        transforms.CenterCrop(299),
        transforms.ToTensor(),
        # NOTE(review): presumably turns single-channel (grayscale) tensors into 3 channels — confirm.
        transforms.Lambda(lambda x: x.expand(3, -1, -1)),
    ]
    return transforms.Compose(steps)

def get_embeddings(
    images,
    no_repetitions=None,
    model=None,
    transform=None,
    batch_size=64,
    device=torch.device("cpu"),
):
    """Embed `images`, optionally appending duplicates of the first image.

    Args:
        images: Images accepted by `transform`.
        no_repetitions: Total number of times the first image should appear. `None` (the
            default) or any value <= 1 adds no duplicates.
        model: Embedding model. When `None`, `get_inception(pretrained=True, pool=True)` is
            used and `transform` is replaced by `inception_transforms()`.
        transform: Per-image transform; defaults to `transforms.ToTensor()` when a model is
            given without one.
        batch_size: Number of images passed to `data_utils.to_batches` per batch.
        device: A `torch.device` or a device string.

    Returns:
        A 2-D numpy array with one row per embedded image tensor, duplicates included.
    """
    if isinstance(device, str):
        device = torch.device(device)
    if model is None:
        model = get_inception(pretrained=True, pool=True).to(device)
        transform = inception_transforms()
    if transform is None:
        transform = transforms.ToTensor()

    # The default `None` used to reach `None > 1` and raise TypeError; treat it as "no duplicates".
    extra_copies = 0 if no_repetitions is None else max(no_repetitions - 1, 0)

    embeddings = []
    for batch_id, batch in enumerate(data_utils.to_batches(images, batch_size)):
        x = torch.stack([transform(img) for img in batch], 0).to(device)
        if batch_id == 0 and extra_copies > 0:
            # Reuse the already-transformed first image instead of transforming it once per copy.
            first_image_repeated = x[:1].expand(extra_copies, *x.shape[1:])
            x = torch.cat([x, first_image_repeated], 0)
        with torch.no_grad():
            output = model(x)
        if isinstance(output, (list, tuple)):
            output = output[0]
        # Flatten everything except the batch axis. Unlike squeeze(), this keeps a batch that
        # holds a single image two-dimensional, so callers can always unpack `n, d = X.shape`.
        embeddings.append(output.reshape(output.shape[0], -1).cpu().numpy())
    return np.concatenate(embeddings, 0)

def embedding_vendi_score(
    images, no_repetitions, batch_size=64, device="cpu", model=None, transform=None
):
    """Compute a Vendi Score from the embeddings of `images`.

    The images are embedded with `get_embeddings` (all arguments are forwarded), and the
    resulting matrix is scored with `vendi.score_X` when it has fewer rows than columns,
    or with `vendi.score_dual` otherwise.
    """
    embedding_matrix = get_embeddings(
        images,
        no_repetitions,
        batch_size=batch_size,
        device=device,
        model=model,
        transform=transform,
    )
    num_rows, num_features = embedding_matrix.shape
    scorer = vendi.score_X if num_rows < num_features else vendi.score_dual
    return scorer(embedding_matrix)


def pixel_vendi_score(images, random_indices, no_repetitions):
  """Score `images` with `image_utils.pixel_vendi_score` after duplicating the first image.

  The first image appears `no_repetitions` times when that value is greater than 1 and
  once otherwise; every other image is kept once. `random_indices` is accepted but not used.
  """
  expanded = []
  remaining = iter(images)
  for first in remaining:
    # Runs for the first image only; the rest of the iterator is appended unchanged below.
    copies = no_repetitions if no_repetitions > 1 else 1
    expanded.extend(first for _ in range(copies))
    break
  expanded.extend(remaining)
  return image_utils.pixel_vendi_score(expanded)

In [None]:
total = totals[0]
vendi_scores = []
for configuration in configs:
  print(f"Configuration: {configuration}")
  config_scores = []

  # Sort the listing: os.listdir returns names in arbitrary order.
  image_dir = f"/content/drive/MyDrive/cv/{configuration}_{total}"
  all_images = []
  for img_name in sorted(os.listdir(image_dir)):
    # Image.open is lazy and keeps the file open; copying inside the context manager loads
    # the pixels and lets the handle be closed right away.
    with Image.open(os.path.join(image_dir, img_name)) as img:
      all_images.append(img.copy())

  for no_samples in [5, 10, 15, 20, 25, 30, 35, 40, 45, 50]:
    for no_reps in [1, 5, 10, 15, 50]:
      # Passed through to pixel_vendi_score, which accepts but does not read it.
      random_indices = random.sample(range(total), no_samples)
      images = random.sample(all_images, no_samples)
      inception_vs = embedding_vendi_score(images, no_reps, batch_size=64, device="cuda")
      pixel_vs = pixel_vendi_score(images, random_indices, no_reps)

      print(f"{no_samples} samples {no_reps} repetitions")
      print(f"inception_vs: {inception_vs}, pixel_vs: {pixel_vs}")
      config_scores.append({
          "samples": no_samples,
          "repetitions": no_reps,
          "scores": {"inception_vs": str(inception_vs), "pixel_vs": str(pixel_vs)}
      })

  vendi_scores.append({
      "configuration": configuration,
      "config_scores": config_scores
  })

  # Rewritten after every configuration so partial results survive an interrupted run.
  # Despite the .jsonl extension the file holds a single JSON array.
  with open("/content/drive/MyDrive/cv/i_vs-p_vs-all_configs-2.jsonl", "w") as jsonfile:
    json.dump(vendi_scores, jsonfile)

Configuration: d_res-color
5 samples 1 repetitions
inception_vs: 1.6030148267745972, pixel_vs: 1.8105870937200514
5 samples 5 repetitions
inception_vs: 1.541734218597412, pixel_vs: 1.5900579701736932
5 samples 10 repetitions
inception_vs: 1.304497241973877, pixel_vs: 1.3661673251219757
5 samples 15 repetitions
inception_vs: 1.243733286857605, pixel_vs: 1.2354108223531883
5 samples 50 repetitions
inception_vs: 1.0923999547958374, pixel_vs: 1.1264660755229317
10 samples 1 repetitions
inception_vs: 2.016962766647339, pixel_vs: 1.8368403827505926
10 samples 5 repetitions
inception_vs: 1.8646939992904663, pixel_vs: 1.7056558370560717
10 samples 10 repetitions
inception_vs: 1.7855262756347656, pixel_vs: 1.6155841028538716
10 samples 15 repetitions
inception_vs: 1.5389010906219482, pixel_vs: 1.5111085519213243
10 samples 50 repetitions
inception_vs: 1.257485270500183, pixel_vs: 1.2261691630976679
15 samples 1 repetitions
inception_vs: 2.090667963027954, pixel_vs: 2.022248048094092
15 samples 

In [None]:
total = totals[0]
vendi_scores = []
for configuration in configs:
  print(f"Configuration: {configuration}")
  config_scores = []

  # Sort the listing: os.listdir returns names in arbitrary order.
  image_dir = f"/content/drive/MyDrive/cv/{configuration}_{total}"
  all_images = []
  for img_name in sorted(os.listdir(image_dir)):
    # Image.open is lazy and keeps the file open; copying inside the context manager loads
    # the pixels and lets the handle be closed right away.
    with Image.open(os.path.join(image_dir, img_name)) as img:
      all_images.append(img.copy())

  for no_samples in [5, 10, 15, 20, 25, 30, 35, 40, 45, 50]:
    # Only the no-duplication case is scored in this cell.
    for no_reps in [1]:
      # Passed through to pixel_vendi_score, which accepts but does not read it.
      random_indices = random.sample(range(total), no_samples)
      images = random.sample(all_images, no_samples)
      inception_vs = embedding_vendi_score(images, no_reps, batch_size=64, device="cuda")
      pixel_vs = pixel_vendi_score(images, random_indices, no_reps)

      print(f"{no_samples} samples {no_reps} repetitions")
      print(f"inception_vs: {inception_vs}, pixel_vs: {pixel_vs}")
      config_scores.append({
          "samples": no_samples,
          "repetitions": no_reps,
          "scores": {"inception_vs": str(inception_vs), "pixel_vs": str(pixel_vs)}
      })

  vendi_scores.append({
      "configuration": configuration,
      "config_scores": config_scores
  })

  # Rewritten after every configuration so partial results survive an interrupted run.
  # Despite the .jsonl extension the file holds a single JSON array.
  with open("/content/drive/MyDrive/cv/i_vs-p_vs-all_configs-3.jsonl", "w") as jsonfile:
    json.dump(vendi_scores, jsonfile)

Configuration: d_res-color
5 samples 1 repetitions
inception_vs: 1.6457939147949219, pixel_vs: 1.62077871699863
10 samples 1 repetitions
inception_vs: 1.9574440717697144, pixel_vs: 2.0534624725828885
15 samples 1 repetitions
inception_vs: 1.904537558555603, pixel_vs: 1.9494735269210823
20 samples 1 repetitions
inception_vs: 2.1980621814727783, pixel_vs: 1.9734183869516897
25 samples 1 repetitions
inception_vs: 2.259702205657959, pixel_vs: 2.1665055234257133
30 samples 1 repetitions
inception_vs: 2.234273672103882, pixel_vs: 2.116802627712444
35 samples 1 repetitions
inception_vs: 2.3073694705963135, pixel_vs: 2.2134627124512063
40 samples 1 repetitions
inception_vs: 2.3404839038848877, pixel_vs: 2.2386366991385556
45 samples 1 repetitions
inception_vs: 2.3809893131256104, pixel_vs: 2.335574460945557
50 samples 1 repetitions
inception_vs: 2.4047281742095947, pixel_vs: 2.314508143188248
Configuration: d_res-color-environment
5 samples 1 repetitions
inception_vs: 1.7544032335281372, pixel

In [None]:
total = totals[1]
vendi_scores = []
for configuration in configs:
  # Only the last configuration is scored in this cell.
  if configuration != configs[-1]:
    continue

  print(f"Configuration: {configuration}")
  config_scores = []

  # Sort the listing: os.listdir returns names in arbitrary order.
  image_dir = f"/content/drive/MyDrive/cv/{configuration}_{total}"
  all_images = []
  for img_name in sorted(os.listdir(image_dir)):
    # Image.open is lazy and keeps the file open; copying inside the context manager loads
    # the pixels and lets the handle be closed right away.
    with Image.open(os.path.join(image_dir, img_name)) as img:
      all_images.append(img.copy())

  for no_samples in [5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 500]:
    for no_reps in [1]:
      # Defined here so the cell no longer depends on a `random_indices` left over from an
      # earlier cell (NameError on a fresh kernel). pixel_vendi_score accepts but does not read it.
      random_indices = random.sample(range(len(all_images)), no_samples)
      images = random.sample(all_images, no_samples)
      inception_vs = embedding_vendi_score(images, no_reps, batch_size=64, device="cuda")
      pixel_vs = pixel_vendi_score(images, random_indices, no_reps)

      print(f"{no_samples} samples {no_reps} repetitions")
      print(f"inception_vs: {inception_vs}, pixel_vs: {pixel_vs}")
      config_scores.append({
          "samples": no_samples,
          "repetitions": no_reps,
          "scores": {"inception_vs": str(inception_vs), "pixel_vs": str(pixel_vs)}
      })

  vendi_scores.append({
      "configuration": configuration,
      "config_scores": config_scores
  })

  # Despite the .jsonl extension the file holds a single JSON array.
  with open("/content/drive/MyDrive/cv/i_vs-p_vs-all_configs-4.jsonl", "w") as jsonfile:
    json.dump(vendi_scores, jsonfile)

Configuration: d_res-color-environment-texture-description
5 samples 1 repetitions
inception_vs: 1.7365570068359375, pixel_vs: 1.7863884245786255
10 samples 1 repetitions
inception_vs: 1.9061369895935059, pixel_vs: 1.9660638416973386
15 samples 1 repetitions
inception_vs: 2.3202710151672363, pixel_vs: 2.1729437518305232
20 samples 1 repetitions
inception_vs: 2.4356980323791504, pixel_vs: 2.3547579552014253
25 samples 1 repetitions
inception_vs: 2.1505815982818604, pixel_vs: 2.3074337637473423
30 samples 1 repetitions
inception_vs: 2.457899808883667, pixel_vs: 2.4936577610164425
35 samples 1 repetitions
inception_vs: 2.4613442420959473, pixel_vs: 2.4852009734468146
40 samples 1 repetitions
inception_vs: 2.4764363765716553, pixel_vs: 2.5305571252917876
45 samples 1 repetitions
inception_vs: 2.496657371520996, pixel_vs: 2.5052282176822316
50 samples 1 repetitions
inception_vs: 2.353614091873169, pixel_vs: 2.5277263684897893
500 samples 1 repetitions
inception_vs: 2.9232311248779297, pixel