In [5]:
import os
# Pin the visible CUDA device to index "1". This is set before `torch` is
# imported below, so it is in place when CUDA is first initialised.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

import sys
# Put the directory two levels above the working directory on sys.path so the
# `utils` package imported below resolves.
# NOTE(review): presumably this is the repository root and `ip_adapter` is
# resolved the same way — confirm.
utils_path = os.path.abspath(os.path.join('../..'))
sys.path.append(utils_path)


import torch
from diffusers import StableDiffusionPipeline, StableDiffusionImg2ImgPipeline, StableDiffusionInpaintPipelineLegacy, DDIMScheduler, AutoencoderKL
from PIL import Image

from ip_adapter import IPAdapterPlus
from utils.dataset_info import get_subjects_prompts_info

In [6]:
# Model identifiers / checkpoint locations handed to the loaders below.
base_model_path = "SG161222/Realistic_Vision_V4.0_noVAE"
vae_model_path = "stabilityai/sd-vae-ft-mse"
image_encoder_path = "models/image_encoder"
ip_ckpt = "models/ip-adapter-plus_sd15.bin"
device = "cuda"

# DDIM scheduler configuration, kept in one place so it is easy to inspect.
scheduler_kwargs = dict(
    num_train_timesteps=1000,
    beta_start=0.00085,
    beta_end=0.012,
    beta_schedule="scaled_linear",
    clip_sample=False,
    set_alpha_to_one=False,
    steps_offset=1,
)
noise_scheduler = DDIMScheduler(**scheduler_kwargs)

# Load the standalone VAE, then cast it to half precision to match the pipeline dtype.
vae = AutoencoderKL.from_pretrained(vae_model_path)
vae = vae.to(dtype=torch.float16)

# Build the Stable Diffusion pipeline around the scheduler and VAE created above;
# the feature extractor and safety checker are explicitly passed as None.
pipeline_kwargs = dict(
    torch_dtype=torch.float16,
    scheduler=noise_scheduler,
    vae=vae,
    feature_extractor=None,
    safety_checker=None,
)
pipe = StableDiffusionPipeline.from_pretrained(base_model_path, **pipeline_kwargs)

# Wrap the pipeline with the IP-Adapter Plus checkpoint (16 image tokens).
ip_model = IPAdapterPlus(pipe, image_encoder_path, ip_ckpt, device, num_tokens=16)

Loading pipeline components...: 100%|██████████| 5/5 [00:00<00:00,  9.07it/s]


In [None]:
# Restrict the run to a single subject (e.g. "backpack"); leave empty to process every subject.
single_subject = ""
# Restrict the run to explicit prompt templates (e.g. ["a {0} {1} near the pool"]);
# leave empty to use each subject's prompts from the dataset info file.
single_prompt = []

# Images generated per prompt; generation i is conditioned on reference image "{i:02d}.jpg".
num_generation = 4


output_path = "../../outputs/subjects/ip_adapter"
dataset_path = "../../pcs_dataset"
dataset_info_path = "../../pcs_dataset/info.json"

prompts_info = get_subjects_prompts_info(dataset_info_path)

if single_subject != "":
    prompts_info = {single_subject: prompts_info[single_subject]}

for subject in prompts_info:

    print(f"***** Subject: {subject} *****")

    subject_output_dir = os.path.join(output_path, subject)
    os.makedirs(subject_output_dir, exist_ok=True)

    prompts = single_prompt if len(single_prompt) else prompts_info[subject]["prompts"]

    for prompt in prompts:
        # The subject itself is supplied through the conditioning image, so the
        # "a {0} {1}" placeholder is dropped from the text; any other placeholder
        # form is replaced by the subject's class name.
        if "a {0} {1}" in prompt:
            prompt = prompt.replace("a {0} {1}", "")
        else:
            prompt = prompt.replace("{0} {1}", prompts_info[subject]["class"])
        print(f"**Prompt**: {prompt}")

        # The (rewritten) prompt text doubles as the output folder name.
        save_path = os.path.join(subject_output_dir, f"{prompt}")
        os.makedirs(save_path, exist_ok=True)

        for i in range(0, num_generation):
            generated_path = os.path.join(save_path, f"{i:04d}.jpg")
            # Skip per image rather than per folder, so a run interrupted halfway
            # through a prompt is completed on the next execution.
            if os.path.exists(generated_path):
                continue

            # Zero-padded to two digits: identical to the old "0{i}" for i < 10
            # and still correct for i >= 10.
            cond_image_path = os.path.join(dataset_path, "subjects", subject, f"{i:02d}.jpg")
            # Context manager closes the reference image's file handle after use.
            with Image.open(cond_image_path) as cond_image:
                image = ip_model.generate(pil_image=cond_image, num_samples=1, num_inference_steps=50, seed=42, prompt=prompt, scale=0.6)

            image[0].save(generated_path)

    print(f"Finished IP-Adapter generation in subject: {subject}!")

## Evaluation

### Calculate the similarity for each sample
Calculate the image-to-image and text-to-image similarity of every generated sample with the CLIP evaluator.

In [12]:
import os, sys, json

# NOTE(review): the generation section of this notebook sets this variable to "1";
# if CUDA was already initialised in the same kernel, reassigning it here
# presumably has no effect — confirm, or restart the kernel before evaluating.
os.environ["CUDA_VISIBLE_DEVICES"] = "6"

# Put the directory two levels above the working directory on sys.path so the
# `utils` package imported below resolves.
utils_path = os.path.abspath(os.path.join('../..'))
sys.path.append(utils_path)

import numpy as np
from utils.clip_eval import evaluate_i2i, evaluate_t2i

In [13]:
def convert_to_native(data):
    """Recursively replace NumPy values with plain Python equivalents.

    Args:
        data: Any object. NumPy arrays become (nested) lists, NumPy scalars
            become Python scalars, and dicts, lists and tuples are walked
            recursively. Dict keys are left untouched.

    Returns:
        A structure of the same shape containing only native Python values
        wherever NumPy values were found; anything else is returned unchanged.
    """
    if isinstance(data, np.ndarray):
        return data.tolist()
    if isinstance(data, np.generic):
        return data.item()
    if isinstance(data, dict):
        return {key: convert_to_native(value) for key, value in data.items()}
    if isinstance(data, list):
        return [convert_to_native(item) for item in data]
    if isinstance(data, tuple):
        # Tuples used to be returned as-is, leaving nested NumPy scalars that
        # json.dump cannot serialise.
        return tuple(convert_to_native(item) for item in data)
    return data
    
def add_evaluation(file_path, new_data):
    """Merge ``new_data`` into the JSON results file at ``file_path``.

    If the file does not exist it is created containing ``new_data``.
    Otherwise the existing top-level object is loaded and updated with
    ``new_data``; existing top-level keys are overwritten.

    Args:
        file_path: Path of the JSON file to create or update.
        new_data: Results to store; NumPy values are converted to native
            Python values before serialisation.

    Raises:
        TypeError: If the merged data is not JSON-serialisable. The file on
            disk is left untouched in that case.
    """
    if not os.path.exists(file_path):
        merged = convert_to_native(new_data)
    else:
        with open(file_path, 'r') as file:
            merged = json.load(file)
        merged.update(convert_to_native(new_data))

    # Serialise before opening the file for writing: opening with 'w' truncates
    # it, so a failure inside json.dump would wipe all earlier results.
    serialized = json.dumps(merged, indent=4)
    with open(file_path, 'w') as file:
        file.write(serialized)

In [None]:
outputs_path = "../../outputs/subjects/ip_adapter"
eval_res_path = "../../eval_results/subjects/ip_adapter"
dataset_path = "../../pcs_dataset/subjects"
dataset_info_path = "../../pcs_dataset/info.json"
os.makedirs(eval_res_path, exist_ok=True)

# Only real, non-hidden directories are subjects; stray entries such as
# ".ipynb_checkpoints" or ".DS_Store" would otherwise crash the loop.
# Sorted so the run order is deterministic.
subjects_list = sorted(
    name for name in os.listdir(outputs_path)
    if not name.startswith(".") and os.path.isdir(os.path.join(outputs_path, name))
)

for subject in subjects_list:
    print(f"***** Subject: {subject} *****")

    subject_dir = os.path.join(outputs_path, subject)
    res_for_each_subject = dict()

    # Each sub-directory is named after the prompt its images were generated from.
    prompt_names = sorted(
        name for name in os.listdir(subject_dir)
        if not name.startswith(".") and os.path.isdir(os.path.join(subject_dir, name))
    )

    for prompt in prompt_names:
        print(f"**Prompt**: {prompt}")

        prompt_dir = os.path.join(subject_dir, prompt)
        res_for_each_prompt = dict()

        for generate_img_name in sorted(os.listdir(prompt_dir)):
            generate_img_path = os.path.join(prompt_dir, generate_img_name)
            # Ignore hidden files and anything that is not a regular file.
            if generate_img_name.startswith(".") or not os.path.isfile(generate_img_path):
                continue
            # Stored as [image-to-image similarity, text-to-image similarity].
            res_for_each_prompt[generate_img_name] = [
                evaluate_i2i(generate_img_path, os.path.join(dataset_path, subject)),
                evaluate_t2i(generate_img_path, prompt),
            ]

        res_for_each_subject[prompt] = res_for_each_prompt
        print(res_for_each_prompt)

    evaluation_res = {subject: res_for_each_subject}

    # Persist after every subject so partial progress survives an interruption.
    add_evaluation(os.path.join(eval_res_path, "evaluation_results.json"), evaluation_res)

### Calculate the average similarity

In [15]:
import json

eval_res_path = "../../eval_results/subjects/ip_adapter/evaluation_results.json"

with open(eval_res_path, "r") as f:
    data = json.load(f)

img_sim = 0.0
text_sim = 0.0
cnt = 0

# Results are nested subject -> prompt -> sample, and every sample holds
# [image similarity, text similarity].
for prompts_of_subject in data.values():
    for samples_of_prompt in prompts_of_subject.values():
        for scores in samples_of_prompt.values():
            img_sim += scores[0]
            text_sim += scores[1]
            cnt += 1

if cnt == 0:
    # Avoid a ZeroDivisionError when the results file holds no samples.
    print(f"No evaluated samples found in {eval_res_path}")
else:
    print("Image Similarity: ", img_sim/cnt, "\nText Similarity:", text_sim/cnt)

Image Similarity:  0.8245813767453457 
Text Similarity: 0.2259754586727061
