In [None]:
# tag list styles
# Generate an image description for Stable Diffusion LoRA training as a list of general tags.
# List at least 10 lowercase tags, ordered by their importance in describing the image.
# It is better if the tags reveal the color characteristics.
# (Replace '_' with a space, omit the '#', and do not mention background-related tags.)
# example: tag1, tag2, tag3, tag4, ....

# descriptions
# Generate an image description for Stable Diffusion LoRA training.
# The description must be written within 70 tokens, without any further explanation.

In [None]:
# Prompt templates sent to the vision model, keyed by captioning style:
#   "TAG"  - asks for a comma-separated, Danbooru-style tag list.
#   "LINE" - asks for one short, comma-separated natural-language caption.
# The key is selected through the `prompt_type` argument of generate_description().
PROMPT_DICT = {
    "TAG" : (
        "Generate image description tags for Stable Diffusion LoRA training like Danbooru style.\n"
        "Please create a tag according to the procedure below.\n"
        "1. Please make the tags specific and concise.\n"
        "2. Avoid words with overlapping meanings.\n"
        "3. Describe the subject, style, environment, lighting, color, mood, and composition.\n"
        "4. Please write with specific numbers or collective nouns.\n\n"
        "Output must follow the format below without any further explanation.\n"
        "tag1, tag2, tag3, tag4, ...."
    ),

    "LINE" : (
        "Generate image description for Stable Diffusion 1.5v LoRA training.\n"
        "If the sentence is long, separate it with commas so that each component can be clearly seen.\n"
        "Please write the description so that the subject, style, color, mood and composition are clearly conveyed. (Describing the composition is especially important.)\n"
        "It should be detailed and concise, avoiding words with overlapping meanings.\n"
        # Spelling fixed here ("Furthemore" -> "Furthermore"); the rest of the prompt is unchanged.
        "Furthermore, sentence with specific numbers or collective nouns is better.\n"
        "Place the sentences you think are more important in expressing the photo forward in order.\n"
        "The description must be written within simple 70 tokens without any further explanation.\n\n"

        # One-shot example showing the expected output shape.
        "ex) A cute cat with big eyes, sitting on a bench, relaxed, and yawning, ..."
    )
}
# Show the active caption prompt so it can be eyeballed before any API call is made.
print(PROMPT_DICT['LINE'])

In [None]:
import base64
import io
import os

import requests
from PIL import ImageChops, Image

def png2rgb(img_path):
    """Load an image and flatten it onto an opaque white background.

    Args:
        img_path: Path of the image file to open with ``PIL.Image.open``.

    Returns:
        A new ``RGB`` PIL image in which every transparent or
        semi-transparent pixel has been blended with white.
    """
    # Open inside a context manager so the file handle is released as soon as
    # the pixel data has been copied by convert().
    with Image.open(img_path) as source:
        # alpha_composite() only accepts RGBA operands. Converting up front lets
        # PNGs stored as RGB / palette / grayscale through instead of failing
        # with ValueError.
        rgba_image = source.convert("RGBA")

    # Opaque white canvas with the same dimensions as the source image.
    background = Image.new("RGBA", rgba_image.size, (255, 255, 255, 255))

    # Blend the image over the canvas using its alpha channel, then drop alpha.
    return Image.alpha_composite(background, rgba_image).convert("RGB")

def decode_img(rgb_image):
    """Serialize an image as PNG and return the bytes as base64 text.

    Despite the name, this function *encodes*: the image is written in PNG
    format to an in-memory buffer and the result is base64-encoded.

    Args:
        rgb_image: Object exposing ``save(fp, format=...)``, e.g. a PIL image.

    Returns:
        str: Base64 representation of the PNG bytes.
    """
    # Use an in-memory, file-like buffer so nothing is written to disk.
    with io.BytesIO() as png_buffer:
        # Store the image in the desired container format (PNG here).
        rgb_image.save(png_buffer, format="PNG")
        png_bytes = png_buffer.getvalue()

    # Base64-encode the raw bytes and turn the result into a text string.
    return base64.b64encode(png_bytes).decode("utf-8")

def _guess_image_mime(base64_image):
    """Infer the MIME type of a base64-encoded image from its leading characters."""
    # The 8-byte PNG signature always base64-encodes to this 10-character prefix.
    if base64_image.startswith("iVBORw0KGg"):
        return "image/png"
    # Anything else keeps the type this code has always declared.
    return "image/jpeg"


def generate_description(base64_image, prompt_type="LINE",
                         additional_tags=("kbank-inspired style",),
                         api_key=None, timeout=120):
    """Ask the OpenAI chat-completions API to caption a base64-encoded image.

    Args:
        base64_image: Base64 text of the image bytes (e.g. from ``decode_img``).
        prompt_type: Key into ``PROMPT_DICT`` selecting the instruction text.
        additional_tags: Iterable of fixed tags placed in front of the generated
            caption. ``None`` or an empty iterable adds no prefix.
        api_key: OpenAI API key. When omitted, the ``OPENAI_API_KEY``
            environment variable is used, so no secret lives in the notebook.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        str: ``"<additional tags>, <caption>"`` lower-cased, with every ``_``
        replaced by a space and every ``.`` removed (this also strips decimal
        points such as the one in "1.5").

    Raises:
        RuntimeError: If no API key is supplied or found in the environment.
        KeyError: If ``prompt_type`` is not a key of ``PROMPT_DICT``.
        requests.HTTPError: If the API answers with an error status.
    """
    # Resolve the credential first so a missing key fails fast, before any
    # request is built or sent.
    api_key = api_key or os.environ.get("OPENAI_API_KEY", "")
    if not api_key:
        raise RuntimeError(
            "No OpenAI API key: pass api_key=... or set the OPENAI_API_KEY environment variable."
        )

    headers = {
      "Content-Type": "application/json",
      "Authorization": f"Bearer {api_key}"
    }

    # Declare the real image type in the data URL instead of always claiming JPEG.
    mime_type = _guess_image_mime(base64_image)

    payload = {
      "model": "gpt-4o",
      "messages": [
        {
          "role": "user",
          "content": [
            {
              "type": "text",
              "text": PROMPT_DICT[prompt_type]
            },
            {
              "type": "image_url",
              "image_url": {
                "url": f"data:{mime_type};base64,{base64_image}"
              }
            }
          ]
        }
      ],
      "max_tokens": 300
    }

    response = requests.post(
        "https://api.openai.com/v1/chat/completions",
        headers=headers,
        json=payload,
        timeout=timeout,  # without a timeout a stalled connection would hang the whole batch
    )
    # Surface HTTP errors (bad key, rate limit, ...) directly instead of a
    # confusing KeyError on 'choices' below.
    response.raise_for_status()
    description = response.json()['choices'][0]['message']['content']

    # Prepend the fixed tags only when there are any, so an empty tag list
    # does not leave a dangling ", " at the start of the caption.
    additional_tags_str = ", ".join(additional_tags or ())
    if additional_tags_str:
        description = additional_tags_str + ", " + description

    # Normalize the caption: lowercase, underscores to spaces, periods removed.
    description = description.lower()
    description = description.replace('_', " ")
    description = description.replace('.', "")

    return description

In [None]:
# test sample
# Smoke test: push a single local image through the whole pipeline once.
test_img = "sample.png"

# Flatten onto white, base64-encode as PNG, then request a "LINE"-style caption.
rgb_image = png2rgb(test_img)
base64_image = decode_img(rgb_image)
description = generate_description(base64_image, prompt_type="LINE")

In [None]:
import os

# Collect the file names of every PNG image found in the source folder.
img_nms = [entry for entry in os.listdir("FULL_3D_IMAGES") if entry.endswith('.png')]

In [None]:
import time
from tqdm import tqdm

# --- Batch captioning: write a (jpg, txt) training pair for every source PNG ---

SRC_DIR = "FULL_3D_IMAGES"   # folder holding the source PNG images
START_INDEX = 560            # presumably a resume offset from an earlier, interrupted run - confirm
REQUEST_PAUSE_SEC = 0.2      # short pause between consecutive API calls

new_dir = "detailed_prompt"
# Create the output folder up front; otherwise the first save below fails
# on a fresh checkout.
os.makedirs(new_dir, exist_ok=True)

# NOTE(review): os.listdir() returns names in arbitrary order, so the
# START_INDEX slice only resumes correctly while the listing order is stable.
img_nms = os.listdir(SRC_DIR)
img_nms = [img_nm for img_nm in img_nms if img_nm.endswith('.png')]

for img_nm in tqdm(img_nms[START_INDEX:]):

    img_path = os.path.join(SRC_DIR, img_nm)

    rgb_image = png2rgb(img_path)
    base64_image = decode_img(rgb_image)
    description = generate_description(base64_image, prompt_type="LINE")

    # splitext() removes only the final extension, so "a.b.png" becomes "a.b"
    # rather than being truncated to "a" and colliding with other files.
    file_nm = os.path.splitext(img_nm)[0]
    rgb_image_path = os.path.join(new_dir, f"{file_nm}.jpg")
    description_path = os.path.join(new_dir, f"{file_nm}.txt")

    # rgb img (jpg)
    rgb_image.save(rgb_image_path)

    # write description (explicit UTF-8 so the result does not depend on the
    # platform's default encoding)
    with open(description_path, "w", encoding="utf-8") as f:
        f.write(description)

    time.sleep(REQUEST_PAUSE_SEC)