In [1]:
import os
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()

# Access your OpenAI API key
openai_api_key = os.getenv("OPENAI_API_KEY")

In [2]:
import json
import base64
import requests
from PIL import Image
from io import BytesIO

def json_to_image(image_path: str) -> Image.Image:
    """
    Converts an image file to a PIL Image object.

    Parameters:
    - image_path (str): Path to the image file.

    Returns:
    - PIL.Image.Image: Image as a PIL Image object.
    """
    return Image.open(image_path)

def read_img(json_path: str) -> dict:
    """
    Reads an image path from a JSON file, converts both a constant image and the image from JSON to JPEG, and sends them to the OpenAI API.

    Parameters:
    - json_path (str): Path to the JSON file containing the image path.
    - openai_api_key (str): OpenAI API key for authentication.

    Returns:
    - dict: Response from the OpenAI API.
    """
    # Constant image path
    constant_image_path = "/home/jabez_kassa/week_12_updated/Semantic-Image-and-Text-Alignment/data/Assets/015efcdd8de3698ffc4dad6dabd6664a/_preview.png"

    # Load JSON data from file
    with open(json_path, 'r') as json_file:
        json_data = json.load(json_file)
    
    # Extract the image path from the JSON data
    image_path = json_data['output_path']

    # Convert image files to PIL Images
    constant_image = json_to_image(constant_image_path).convert("RGB")
    image = json_to_image(image_path).convert("RGB")

    # Convert images to base64 strings in JPEG format
    buffered1 = BytesIO()
    constant_image.save(buffered1, format="JPEG")
    base64_constant_image = base64.b64encode(buffered1.getvalue()).decode('utf-8')

    buffered2 = BytesIO()
    image.save(buffered2, format="JPEG")
    base64_image = base64.b64encode(buffered2.getvalue()).decode('utf-8')

    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {openai_api_key}"
    }

    payload = {
        "model": "gpt-4o",
        "messages": [
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": "compare the two image object positions. and suggest what to change in image 2 to make it like image 1"
                    },
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:image/jpeg;base64,{base64_constant_image}"
                        }
                    },
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:image/jpeg;base64,{base64_image}"
                        }
                    }
                ]
            }
        ],
        "max_tokens": 300
    }

    response = requests.post("https://api.openai.com/v1/chat/completions", headers=headers, json=payload)
    response_data = response.json()
    return response_data['choices'][0]['message']['content']


In [17]:
json_path = "./output/output.json"
read_img(json_path)


'To make the second image more like the first image in terms of object positioning and layout, you can make the following adjustments:\n\n1. **Title Placement:**\n   - Move the "OFFBLAK Generation T" title to the top of the image, similar to the first image.\n\n2. **Text Layout:**\n   - Place the marketing text ("Discover 12...") in the middle of the image, similar to how the "Subscribe to..." text is centered in the first image.\n\n3. **Product Presentation:**\n   - Display a broader selection of products and place them with some spacing around the middle of the image (like the tea packs and boxes in the first image).\n\n4. **Call-to-Action Button:**\n   - Add a clear "Shop Now" button at the bottom of the image, as done in the first image.\n\n5. **Supplementary Objects:**\n   - Add some lifestyle items around the products (e.g., a cup of tea, a pair of sunglasses) to create a similar vibe to the first image.\n\n6. **Hand/Icon Element:**\n   - Place an icon or an element to guide the 

In [3]:
import numpy as np
from PIL import Image, ImageEnhance
import os
import json

def get_position_from_constant(position_name, larger_image, smaller_image):
    """
    Get the (x, y) coordinates for a given position name based on the size of the larger and smaller images.

    Parameters:
    - position_name (str): Name of the position (e.g., "top right").
    - larger_image (PIL.Image.Image): The larger image.
    - smaller_image (PIL.Image.Image): The smaller image.

    Returns:
    - tuple: (x, y) coordinates for the given position.
    """
    larger_w, larger_h = larger_image.size
    smaller_w, smaller_h = smaller_image.size

    positions = {
        "top left": (0, 0),
        "top center": ((larger_w - smaller_w) // 2, 0),
        "top right": (larger_w - smaller_w, 0),
        "center left": (0, (larger_h - smaller_h) // 2),
        "center center": ((larger_w - smaller_w) // 2, (larger_h - smaller_h) // 2),
        "center right": (larger_w - smaller_w, (larger_h - smaller_h) // 2),
        "bottom left": (0, larger_h - smaller_h),
        "bottom center": ((larger_w - smaller_w) // 2, larger_h - smaller_h),
        "bottom right": (larger_w - smaller_w, larger_h - smaller_h),
    }

    return positions.get(position_name, (0, 0))

def blend_images(position_names: list, alpha: float = 0.5, output_dir: str = './output') -> str:
    """
    Blends multiple small images by placing them on top of a larger image at specified positions without overlapping and saves the blended image.

    Parameters:
    - position_names (list of str): List of position names for each small image, e.g., ["top left", "top right", "center center"].
    - alpha (float): Blending factor for transparency (0.0 to 1.0). Default is 0.5.
    - output_dir (str): Directory where the blended image will be saved. Default is './output'.

    Returns:
    - str: Path to the JSON file containing the output image information.
    """
    # Define paths to small images (example paths)
    small_images_paths = [
        '/home/jabez_kassa/week_12_updated/Semantic-Image-and-Text-Alignment/data/Assets/015efcdd8de3698ffc4dad6dabd6664a/cta.jpg',
        '/home/jabez_kassa/week_12_updated/Semantic-Image-and-Text-Alignment/data/Assets/015efcdd8de3698ffc4dad6dabd6664a/discover.png',
        '/home/jabez_kassa/week_12_updated/Semantic-Image-and-Text-Alignment/data/Assets/015efcdd8de3698ffc4dad6dabd6664a/endframe_3.png',
        '/home/jabez_kassa/week_12_updated/Semantic-Image-and-Text-Alignment/data/Assets/015efcdd8de3698ffc4dad6dabd6664a/engagement_animation_1.png',
        '/home/jabez_kassa/week_12_updated/Semantic-Image-and-Text-Alignment/data/Assets/015efcdd8de3698ffc4dad6dabd6664a/engagement_instruction_1.png',
        '/home/jabez_kassa/week_12_updated/Semantic-Image-and-Text-Alignment/data/Assets/015efcdd8de3698ffc4dad6dabd6664a/landing_endframe.jpg'
    ]

    if len(small_images_paths) != len(position_names):
        raise ValueError("The number of small images must match the number of position names.")

    # Load larger image from file path (example path)
    larger_image_path = '/home/jabez_kassa/week_12_updated/Semantic-Image-and-Text-Alignment/data/Assets/015efcdd8de3698ffc4dad6dabd6664a/endframe_1.jpg'
    larger_image = Image.open(larger_image_path)
    larger_image = larger_image.convert("RGBA")  # Ensure the larger image has an alpha channel

    # Create a blank image with the same size as the larger image
    blended_image = Image.new("RGBA", larger_image.size)

    # Paste the larger image onto the blank image
    blended_image.paste(larger_image, (0, 0))

    # Blend each small image at its respective position
    for i, small_image_path in enumerate(small_images_paths):
        # Load small image from file path
        smaller_image = Image.open(small_image_path)
        smaller_image = smaller_image.convert("RGBA")  # Ensure the smaller image has an alpha channel

        # Resize smaller image if necessary
        smaller_w, smaller_h = smaller_image.size
        larger_w, larger_h = larger_image.size
        if smaller_h > larger_h or smaller_w > larger_w:
            aspect_ratio = smaller_w / smaller_h
            if smaller_h > larger_h:
                smaller_h = larger_h
                smaller_w = int(smaller_h * aspect_ratio)
            if smaller_w > larger_w:
                smaller_w = larger_w
                smaller_h = int(smaller_w / aspect_ratio)
            smaller_image = smaller_image.resize((smaller_w, smaller_h))

        # Make the smaller image semi-transparent
        enhancer = ImageEnhance.Brightness(smaller_image)
        smaller_image = enhancer.enhance(alpha)

        # Get position for the smaller image
        x_offset, y_offset = get_position_from_constant(position_names[i], larger_image, smaller_image)

        # Paste the smaller image onto the blended image at the calculated position
        blended_image.paste(smaller_image, (x_offset, y_offset), smaller_image)

    # Save blended image to output directory
    os.makedirs(output_dir, exist_ok=True)
    output_file = os.path.join(output_dir, 'blended_image.png')
    blended_image.save(output_file)

    # Create JSON response
    json_data = {
        'output_path': output_file,
        'positions': position_names,
        'alpha': alpha
    }

    # Write JSON to file
    json_output_file = os.path.join(output_dir, 'output.json')
    with open(json_output_file, 'w') as f:
        json.dump(json_data, f, indent=4)

    return json_output_file


In [14]:
positions_list =  ["bottom center", "top right", "top left","center center", "bottom left", "top centert"]

# Call the blend_images function
blend_images(positions_list, alpha=0.5, output_dir='./output')

'./output/output.json'

In [34]:
type(positions_list)

list

In [4]:
llm_config2 = {"config_list": [{"model": "gpt-4", "api_key": openai_api_key}]}
code_execution_config = {"use_docker": False}

# Initialize the assistant agent with the given configurations
config_list = [
    {"model": "gpt-4", "api_key": openai_api_key, "api_type": "openai"},
]

In [5]:
llm_config={
        "temperature": 0,
        "timeout": 600,
        "cache_seed": 42,
        "config_list": config_list,
        "functions": [
             {
                        "name": "blend_images",
                        "description": "use this function to blend the images",
                        "parameters": {
                            "type": "object",
                            "properties": {
                                "position_names": {
                                    "type": "string",
                                    "description": "This is where you will position the blending"
                                },
                            },
                            "required": ["positions_str"]
                        }
                        },
             {
                        "name": "read_img",
                        "description": "use this to read the image blended from blended image",
                        "parameters": {
                            "type": "object",
                            "properties": {
                                "output_json": {
                                    "type": "object",
                                    "description": "This is the blended image"
                                },
                            },
                            "required": ["position"]
                        }
                        }
                        ],
}


In [26]:
import os
import autogen

from autogen import ConversableAgent

# Let's first define the assistant agent that suggests tool calls.
img_blend_assistant = autogen.AssistantAgent(
    name="image_blending_assistant",
    code_execution_config=False,
    system_message=
    
"""You are a helpful AI assistant. 
The main problems you will be solving include:
- suggest diffrent "positions" to make a good advertising based on the feedback from 'img_critic_assistan'
    use this as example: "List of position names for each small image, e.g., ["top left", "top right", "center center", "bottom left", "bottom center", "bottom right"]"
    make sure you are giving only 6 positions
    make sure that the images will not overlap
- This are the discription of the pictures:
    1. the first position is for a picture that have a text saying 'shop now '
    2. the second position is for a picture that have a text saying 'Discover 12 unique tea flavours delivered to your door'
    3. the third position is for a picture that have a text saying 'Enjoy tea delivered to your home'
    4. the fourth position is for a picture that shows a hand pointing
    5. the fifth position is for a picture that have a text saying 'tap to get letter box delivery of tea'
    6. the sixth position is for a picture that have a text saying 'off black generation picture'

- your task:
    - considering the above discription for each picture find a way to position each picture to give a good advertaizing 
    based on the recomendation you got from 'img_critic_assistant'
    - 'TERMINATE' when the image you blend looks like the feedback.
    """
    ,
    llm_config=llm_config,
    function_map={
        "blend_images": blend_images,
        "read_img": read_img
    }
        
)

img_critic_assistant = autogen.AssistantAgent(
    name="image_blending_assistant",
    code_execution_config=False,
    system_message="You are a advertizing image critic AI assistant. "
    """You task is to critic the 'output.json' from 'img_blend_assistant'
    critic only the following metrics: 
  
                            - the first position is for a picture that have a text saying 'shop now '
                            - the second position is for a picture that have a text saying 'Discover 12 unique tea flavours delivered to your door'
                            - the third position is for a picture that have a text saying 'Enjoy tea delivered to your home'
                            - the fourth position is for a picture that shows a hand pointing
                            - the fifth position is for a picture that have a text saying 'tap to get letter box delivery of tea'
                            - the sixth position is for a picture that have a text saying 'off black generation picture'
    recommend 'img_blend_assistant' for a better advertising by comparing to image 1 which is a good advertisement on the above metrics.
    "Return 'TERMINATE' when the task is done.""",
    llm_config=llm_config,
    function_map={
        "blend_images": blend_images,
        "read_img": read_img
    }
        
)




# The user proxy agent is used for interacting with the assistant agent
# and executes tool calls.
def termination_msg(x):
    return isinstance(x, dict) and "TERMINATE" == str(x.get("content", ""))[-9:].upper()
user_proxy = autogen.UserProxyAgent(
    name="user_proxy",
    system_message="Executor. Execute the functions recommended by the assistants.",
    is_termination_msg=termination_msg,
    human_input_mode="NEVER",
    code_execution_config=False
    # # is_termination_msg=lambda x: "content" in x and x["content"] is not None and x["content"].rstrip().endswith("TERMINATE"),
    # # code_execution_config={"work_dir": "planning"},
    # function_map={"blend_images": blend_images},
    # code_execution_config=False
)

def state_transition(last_speaker, groupchat):
    messages = groupchat.messages
    if last_speaker is user_proxy:
        # If user_proxy was last, we decide whether to start blending or critiquing
        return img_blend_assistant

    elif last_speaker is img_blend_assistant:
        # After img_blend_assistant, we need user_proxy to execute some code
        # Therefore, the next state should be user_proxy
        return img_critic_assistant

    elif last_speaker is img_critic_assistant:
        # After img_critic_assistant, we need user_proxy to execute some code
        # Therefore, the next state should be user_proxy
        return img_blend_assistant

    # elif last_speaker is user_proxy and 'img_blend_assistant' in messages[-1]["content"]:
    #     # If user_proxy just executed code for img_blend_assistant, next is img_critic_assistant
    #     return img_critic_assistant

    # elif last_speaker is user_proxy and 'img_critic_assistant' in messages[-1]["content"]:
    #     # If user_proxy just executed code for img_critic_assistant, next is img_blend_assistant
    #     return img_blend_assistant


groupchat = autogen.GroupChat(
    agents=[user_proxy, img_blend_assistant, img_critic_assistant],
 
    messages=[],  # The initial messages in the chat
    max_round=5,  # Maximum rounds of conversation
    speaker_selection_method=state_transition,
)

manager = autogen.GroupChatManager(
    groupchat=groupchat,
    llm_config=llm_config2
)



In [23]:
message = user_proxy.initiate_chat(
    manager, message="blend the images at diffrent positions")


[33mimage_blending_assistant[0m (to image_blending_assistant):

blend the images at diffrent positions

--------------------------------------------------------------------------------
[33mimage_blending_assistant[0m (to image_blending_assistant):

[32m***** Suggested function call: blend_images *****[0m
Arguments: 
{
  "position_names": "first, second, third, fourth, fifth, sixth"
}
[32m*************************************************[0m

--------------------------------------------------------------------------------
[35m
>>>>>>>> EXECUTING FUNCTION blend_images...[0m
[33mimage_blending_assistant[0m (to image_blending_assistant):

[32m***** Response from calling function (blend_images) *****[0m
Error: The number of small images must match the number of position names.
[32m*********************************************************[0m

--------------------------------------------------------------------------------
[33mimage_blending_assistant[0m (to image_blending_as

KeyboardInterrupt: 

In [58]:
chat = message.chat_history
chat[-1]['Response from calling function (blend_images)']

KeyError: 'Response from calling function (blend_images)'

In [59]:
message

ChatResult(chat_id=None, chat_history=[{'content': 'blend the images at diffrent positions', 'role': 'assistant'}, {'content': '', 'function_call': {'arguments': '{\n  "position_names": ["top left", "top right", "center center", "bottom left", "bottom center", "bottom right"]\n}', 'name': 'blend_images'}, 'name': 'image_blending_assistant', 'role': 'assistant'}, {'content': './output/output.json', 'name': 'blend_images', 'role': 'function'}, {'content': "The images have been blended successfully. Now, let's read the blended image to provide a critique.", 'function_call': {'arguments': '{}', 'name': 'read_img'}, 'name': 'image_blending_assistant', 'role': 'assistant'}, {'content': "Error: read_img() missing 1 required positional argument: 'json_path'", 'name': 'read_img', 'role': 'function'}, {'content': "Apologies for the confusion. Let's try reading the image again.", 'function_call': {'arguments': '{\n  "json_path": "./output/output.json"\n}', 'name': 'read_img'}, 'name': 'image_blen

In [10]:
import os
import autogen
from autogen import ConversableAgent

# Define the assistant agent that suggests tool calls.
img_blend_assistant = autogen.AssistantAgent(
    name="img_blend_assistant",
    code_execution_config=False,
    system_message="""You are a helpful AI assistant. 
The main problems you will be solving include:
- suggest different "positions" to make a good advertising based on the feedback from 'img_critic_assistant'
    use this as example: "List of position names for each small image, e.g., ["top left", "top right", "center center", "bottom left", "bottom center", "bottom right"]"
    make sure you are giving only 6 positions
    make sure that the images will not overlap
- picture positions: ['shop now','Discover 12 unique tea flavours delivered to your door', 'Enjoy tea delivered to your home', hand pointing, 'tap to get letter box delivery of tea','off black generation picture']
- Your task:
    - Considering the above descriptions for each picture, find a way to position each picture to give good advertising based on the recommendation you got from 'img_critic_assistant'.
    - 'TERMINATE' when the image you blend looks like the feedback.
    """,
    llm_config=llm_config2
)

img_critic_assistant = autogen.AssistantAgent(
    name="img_critic_assistant",
    code_execution_config=False,
    system_message="""You are an advertising image critic AI assistant. 
Your task is to critique the 'output.json' from 'img_blend_assistant'.
positions: ['shop now','Discover 12 unique tea flavours delivered to your door', 'Enjoy tea delivered to your home', hand pointing, 'tap to get letter box delivery of tea','off black generation picture']
Recommend 'img_blend_assistant' for better advertising by comparing it to image 1, which is a good advertisement on the above metrics.
Return 'TERMINATE' when the task is done.""",
    llm_config=llm_config2
)

# The user proxy agent is used for interacting with the assistant agent and executes tool calls.
def termination_msg(x):
    return isinstance(x, dict) and "TERMINATE" == str(x.get("content", ""))[-9:].upper()

user_proxy = autogen.UserProxyAgent(
    name="user_proxy",
    system_message="Executor. Execute the functions recommended by the assistants.",
    is_termination_msg=termination_msg,
    human_input_mode="NEVER",
    code_execution_config=False
)

# Register functions for execution
img_blend_assistant.register_for_llm(name="blend_images", description="Image blender")(blend_images)
img_critic_assistant.register_for_llm(name="read_img", description="Image reader")(read_img) 
user_proxy.register_for_execution(name="blend_images")(blend_images)
user_proxy.register_for_execution(name="read_img")(read_img)

# Create group chat
groupchat = autogen.GroupChat(
    agents=[user_proxy, img_blend_assistant, img_critic_assistant],
    messages=[],  # The initial messages in the chat
    max_round=15  # Maximum rounds of conversation
)

# Create group chat manager
manager = autogen.GroupChatManager(
    groupchat=groupchat,
    llm_config=llm_config2
)


In [11]:
chat_result = user_proxy.initiate_chat(manager, message="blend the images at diffrent positions")

[33muser_proxy[0m (to chat_manager):

blend the images at diffrent positions

--------------------------------------------------------------------------------
[32m
Next speaker: img_blend_assistant
[0m
[33mimg_blend_assistant[0m (to chat_manager):

Sure, let's find suitable positions for the images while considering that they won't overlap:

1. 'shop now' - this is usually a call to action, it would usually be bottom center or top right. Let's place it at "bottom center".

2. 'Discover 12 unique tea flavours delivered to your door' - This is a key selling point, so it should be prominent but not overpowering. I'd suggest placing it at "top right".

3. 'Enjoy tea delivered to your home' - This is reinforcing the convenience of the service, so we can place it at  "top left".

4. 'hand pointing' - The hand pointing usually directs attention to something important, so its position would depend on what it's pointing at. If it's pointing at 'shop now' then it could be at 'bottom right'

In [None]:
import os
import autogen

from autogen import ConversableAgent


# Assistant agent for blending images
img_blend_assistant = autogen.AssistantAgent(
    name="image_blending_assistant",
    code_execution_config=False,
    system_message="""You are a helpful AI assistant. 
The main problems you will be solving include:
- Suggest different "positions" to make a good advertisement based on the feedback from 'img_critic_assistant'
  Use this as example: "List of position names for each small image, e.g., ["top left", "top right", "center center", "bottom left", "bottom center", "bottom right"]"
  Make sure you are giving only 6 positions
  Make sure that the images will not overlap
- This is the description of the pictures:
  1. The first position is for a picture that has a text saying 'shop now'
  2. The second position is for a picture that has a text saying 'Discover 12 unique tea flavours delivered to your door'
  3. The third position is for a picture that has a text saying 'Enjoy tea delivered to your home'
  4. The fourth position is for a picture that shows a hand pointing
  5. The fifth position is for a picture that has a text saying 'tap to get letter box delivery of tea'
  6. The sixth position is for a picture that has a text saying 'off black generation picture'

- Your task:
  - Considering the above description for each picture, find a way to position each picture to give a good advertisement based on the recommendation you got from 'img_critic_assistant'
  - 'TERMINATE' when the image you blend looks like the feedback.
""",
    llm_config=llm_config2,
)

# Assistant agent for criticizing the blended image
img_critic_assistant = autogen.AssistantAgent(
    name="image_critic_assistant",
    code_execution_config=False,
    system_message="""You are an advertising image critic AI assistant. 
Your task is to criticize the 'output.json' from 'img_blend_assistant' based on the following metrics:
- The first position is for a picture that has a text saying 'shop now'
- The second position is for a picture that has a text saying 'Discover 12 unique tea flavours delivered to your door'
- The third position is for a picture that has a text saying 'Enjoy tea delivered to your home'
- The fourth position is for a picture that shows a hand pointing
- The fifth position is for a picture that has a text saying 'tap to get letter box delivery of tea'
- The sixth position is for a picture that has a text saying 'off black generation picture'
Recommend 'img_blend_assistant' for better advertising by comparing to image 1 which is a good advertisement on the above metrics.
Return 'TERMINATE' when the task is done.
""",
    llm_config=llm_config2,
)

# Define the termination message check
def termination_msg(x):
    return isinstance(x, dict) and "TERMINATE" == str(x.get("content", ""))[-9:].upper()

# User proxy agent for executing functions
user_proxy = autogen.UserProxyAgent(
    name="user_proxy",
    system_message="Executor. Execute the functions recommended by the assistants.",
    is_termination_msg=termination_msg,
    human_input_mode="NEVER",
    code_execution_config=False
)

# Assuming blend_images and read_img are defined elsewhere or imported
img_blend_assistant.register_for_llm(name="blend_images", description="Image blender")(blend_images)
img_critic_assistant.register_for_llm(name="read_img", description="Image reader")(read_img)
user_proxy.register_for_execution(name="blend_images")(blend_images)
user_proxy.register_for_execution(name="read_img")(read_img)

# Group chat setup
groupchat = autogen.GroupChat(
    agents=[user_proxy, img_blend_assistant, img_critic_assistant],
    messages=[],  # The initial messages in the chat
    max_round=10,  # Maximum rounds of conversation
)

# Group chat manager setup
manager = autogen.GroupChatManager(
    groupchat=groupchat,
    llm_config=llm_config2
)
