Code to generate additional images from an input image and a prompt.
Source reference - Kaggle : Generation of images via Stable Diffusion

In [None]:
import os
import json
import shutil
import numpy as np
import pandas as pd
from PIL import Image
import matplotlib.pyplot as plt
import pickle
import torch
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader
import torchvision
from transformers import BlipProcessor, BlipForConditionalGeneration, CLIPProcessor, CLIPModel
from diffusers import AutoPipelineForImage2Image
from compel import Compel

import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag



In [2]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
# Download NLTK resources (you only need to do this once).
# 'punkt' is presumably the tokenizer data used by word_tokenize and
# 'averaged_perceptron_tagger' the model used by pos_tag — confirm the
# resource names against the installed NLTK version.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\maria\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\maria\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [None]:
# Run on the GPU when torch reports CUDA as available, otherwise on the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [3]:
def foodKeywordMapping(caption="A cheesy pepperoni pizza sitting on top of a pan"):
    """Extract the nouns of a caption as candidate food keywords.

    The caption is tokenized with ``word_tokenize`` and tagged with
    ``pos_tag``. Every token tagged as a noun is kept, in caption order.
    No food-specific filtering is applied, so non-food nouns such as
    'top' or 'pan' are returned as well.

    Args:
        caption: Caption text to analyse.

    Returns:
        A dict with the single key ``'food_keywords'`` mapping to the list
        of noun tokens found in the caption.
    """
    # Penn Treebank noun tags: NN (singular), NNS (plural),
    # NNP (proper singular), NNPS (proper plural). NNP is included so that
    # singular proper nouns are not dropped while plural ones are kept.
    noun_tags = {'NN', 'NNS', 'NNP', 'NNPS'}
    tagged_words = pos_tag(word_tokenize(caption))
    food_keywords = [word for word, tag in tagged_words if tag in noun_tags]
    return {'food_keywords': food_keywords}


print(foodKeywordMapping())

{'food_keywords': ['cheesy', 'pepperoni', 'pizza', 'top', 'pan']}


In [None]:
class args:
    # Run configuration, read as plain class attributes (e.g. args.infer_steps).
    batch_size = 32        # not referenced by the cells shown in this file
    return_sequences = 3   # not referenced by the cells shown in this file
    dataset = 'pets'       # prefix of the generated image file names
    seed = 42              # not referenced by the cells shown in this file
    infer_steps = 100      # passed to the pipeline as num_inference_steps

# Path of the sample input image. A raw string keeps every backslash literal,
# so sequences such as "\d", "\c", "\f", "\a" or "\0" are never treated as
# escape sequences (the value is identical to the previous literal).
# NOTE(review): get_input() indexes raw_images as raw_images[idx][0] and reads
# .size from the result, i.e. it expects a sequence of (image, label) pairs
# rather than a path string — confirm how this path is meant to be loaded.
raw_images = r".\data\coco_food\food_images\all_food_train2017\000000001059.jpg"

In [None]:
# Load the Stable Diffusion v1.5 image-to-image pipeline with the safety
# checker disabled.
use_cuda = device == 'cuda'
pipeline = AutoPipelineForImage2Image.from_pretrained(
    "runwayml/stable-diffusion-v1-5",
    # Half precision is only used on the GPU; on the CPU the fp16 weights are
    # loaded and cast to float32.
    torch_dtype=torch.float16 if use_cuda else torch.float32,
    variant="fp16",
    safety_checker=None,
    requires_safety_checker=False,
    use_safetensors=True,
)
if use_cuda:
    # Model CPU offload moves each sub-model to the GPU on demand, so the
    # pipeline must not be moved to CUDA beforehand (doing so defeats the
    # memory saving).
    pipeline.enable_model_cpu_offload()
else:
    pipeline = pipeline.to(device)

# Compel turns weighted/blended prompt strings into prompt embeddings using
# the pipeline's own tokenizer and text encoder.
compel_proc = Compel(tokenizer=pipeline.tokenizer, text_encoder=pipeline.text_encoder)

In [None]:
#Kaggle version
def get_input(idx, raw_images, df):
    """Assemble the pipeline keyword arguments for sample ``idx``.

    Reads the 'best_caption' and 'similarity' columns of ``df`` at row
    ``idx`` and the pair stored at ``raw_images[idx]`` (element 0 is the
    image, element 1 is looked up in ``class_mapping``, which is not defined
    in the cells shown here).

    Args:
        idx: Row label in ``df`` and index into ``raw_images``.
        raw_images: Indexable collection of (image, label-key) pairs.
        df: Table providing 'best_caption' and 'similarity'.

    Returns:
        A dict of keyword arguments for the pipeline call.
    """
    best_caption = df.loc[idx, 'best_caption']
    class_name = class_mapping.get(raw_images[idx][1])

    # Guidance is a quadratic function of the caption/image similarity.
    similarity = df.loc[idx, 'similarity']
    guidance = -4 * similarity**2 + 2 * similarity + 1

    # Blend the caption with a generic class prompt (weights 0.9 and 1.5).
    prompt_embeds = compel_proc(f'("{best_caption}", "a picture of {class_name}").blend(0.9, 1.5)')

    picture = raw_images[idx][0]
    src_w, src_h = picture.size
    # Always upscale by 2 first; the branches below then resize again, with
    # target sizes computed from the ORIGINAL dimensions.
    picture = picture.resize((src_w * 2, src_h * 2))

    target_size = None
    if src_w > 650:
        shrink = int(src_w / 650) + 1
        target_size = (src_w // shrink, src_h // shrink)
    elif src_w < 400:
        grow = 1.5
        target_size = (int(src_w * grow), int(src_h * grow))
    if target_size is not None:
        picture = picture.resize(target_size)

    pipeline_kwargs = {
        'prompt_embeds': prompt_embeds,
        'num_images_per_prompt': 3,
        'num_inference_steps': args.infer_steps,
        'image': picture,
        'guidance': guidance,
        'height': 224,
        'width': 224,
        'noise': 0.3,
    }
    return pipeline_kwargs

In [None]:
#My improvised version with label
def get_input(idx, raw_images, df,
              caption="A cheesy pepperoni pizza sitting on top of a pan"):
    """Build the pipeline keyword arguments for sample ``idx``.

    The prompt blends ``caption`` with "a picture of <keywords>", where the
    keywords are the nouns that ``foodKeywordMapping`` extracts from the
    caption.

    Args:
        idx: Row label in ``df`` and index into ``raw_images``.
        raw_images: Indexable collection whose items hold the image at
            position 0.
        df: Table providing the 'similarity' column.
        caption: Caption used for the prompt. Defaults to the sample caption
            that was previously hard-coded in the function body.

    Returns:
        A dict of keyword arguments for the pipeline call.
    """
    # foodKeywordMapping returns {'food_keywords': [...]}; join the list
    # itself. Joining the dict would iterate over its keys and make the label
    # the literal string 'food_keywords'.
    keywords = foodKeywordMapping(caption)['food_keywords']
    label = ', '.join(keywords)

    # Blend the caption with the keyword prompt (weights 0.9 and 1.5).
    prompt = f'("{caption}", "a picture of {label}").blend(0.9, 1.5)'
    prompt_embeds = compel_proc(prompt)

    # Guidance is a quadratic function of the caption/image similarity.
    guidance = df.loc[idx, 'similarity']
    guidance = -4 * guidance**2 + 2 * guidance + 1

    image = raw_images[idx][0]
    width, height = image.size
    # Always upscale by 2 first; the branches below then resize again, with
    # target sizes computed from the ORIGINAL dimensions.
    image = image.resize((width * 2, height * 2))

    if width > 650:
        scale = int(width / 650) + 1
        image = image.resize((width // scale, height // scale))
    elif width < 400:
        scale = 1.5
        image = image.resize((int(width * scale), int(height * scale)))

    # NOTE(review): 'guidance', 'noise', 'height' and 'width' do not look like
    # diffusers img2img call parameters (those are `guidance_scale` and
    # `strength`) — confirm these keys are actually honoured by the pipeline.
    return {
        'prompt_embeds': prompt_embeds,
        'num_images_per_prompt': 3,
        'num_inference_steps': args.infer_steps,
        'image': image,
        'guidance': guidance,
        'height': 224,
        'width': 224,
        'noise': 0.3,
    }

In [None]:
# Generate images for the first two samples and save every returned image as
# "<dataset>_generated_<sample index>_<image index>.png" in the working
# directory.
for idx in tqdm(range(0, 2)):
    inputs = get_input(idx, raw_images, df)
    images = pipeline(**inputs).images
    for d, generated in enumerate(images):
        # 'jpg' is not a registered Pillow format name (it is 'JPEG'), and the
        # file name ends in .png, so write real PNG data.
        generated.save(f"{args.dataset}_generated_{idx}_{d}.png", format='PNG')

In [None]:
# Show the original image next to the three images generated for sample 0.
fig, axes = plt.subplots(1, 4, figsize=(20, 10))

axes[0].imshow(raw_images[0][0])
axes[0].set_title('Original image')

for i in range(3):
    # Read the file name pattern the generation loop writes (relative to the
    # working directory, prefixed with args.dataset) instead of a hard-coded
    # Kaggle path.
    generated_image = Image.open(f"{args.dataset}_generated_0_{i}.png")
    axes[i+1].imshow(generated_image)
    axes[i+1].set_title(f"Generated image - {i}");