In [1]:
%pip install ultralytics
import ultralytics
ultralytics.checks()

Ultralytics YOLOv8.2.52 🚀 Python-3.8.10 torch-2.3.1 CPU (aarch64)
Setup complete ✅ (16 CPUs, 30.8 GB RAM, 42.4/193.6 GB disk)


In [4]:
!yolo predict model=yolov8n.pt source='/home/jabez_kassa/week_12/Semantic-Image-and-Text-Alignment/data/Assets/0a22f881b77f00220f2034c21a18b854/rev-thumbnail-mpu.jpg'

Ultralytics YOLOv8.2.52 🚀 Python-3.8.10 torch-2.3.1 CPU (aarch64)
YOLOv8n summary (fused): 168 layers, 3151904 parameters, 0 gradients, 8.7 GFLOPs

image 1/1 /home/jabez_kassa/week_12/Semantic-Image-and-Text-Alignment/data/Assets/0a22f881b77f00220f2034c21a18b854/rev-thumbnail-mpu.jpg: 320x640 4 persons, 3 cars, 170.9ms
Speed: 2.5ms preprocess, 170.9ms inference, 1.1ms postprocess per image at shape (1, 3, 320, 640)
Results saved to [1m/home/jabez_kassa/week_12/Semantic-Image-and-Text-Alignment/runs/detect/predict3[0m
💡 Learn more at https://docs.ultralytics.com/modes/predict


In [1]:
import os
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()

# Access your OpenAI API key
openai_api_key = os.getenv("OPENAI_API_KEY")

In [3]:
import json
import base64
import requests
from PIL import Image
from io import BytesIO

def encode_image(image_path: str) -> str:
    """
    Encodes an image to a base64 string.

    Parameters:
    - image_path (str): Path to the input image.

    Returns:
    - str: Base64-encoded image data.
    """
    with open(image_path, 'rb') as image_file:
        image_data = image_file.read()
    return base64.b64encode(image_data).decode('utf-8')

def json_to_image(image_path: str) -> Image.Image:
    """
    Converts an image file to a PIL Image object.

    Parameters:
    - image_path (str): Path to the image file.

    Returns:
    - PIL.Image.Image: Image as a PIL Image object.
    """
    # Open the image file
    image = Image.open(image_path)
    
    return image

def read_img(json_path: str) -> dict:
    """
    Reads an image path from a JSON file, converts the image to JPEG, and sends it to the OpenAI API.

    Parameters:
    - json_path (str): Path to the JSON file containing the image path.
    - openai_api_key (str): OpenAI API key for authentication.

    Returns:
    - dict: Response from the OpenAI API.
    """
    # Load JSON data from file
    with open(json_path, 'r') as json_file:
        json_data = json.load(json_file)
    
    # Extract the image path from the JSON data
    image_path = json_data['output_path']
    
    # Convert image file to PIL Image
    image = json_to_image(image_path)
    
    # Convert image to base64 string in JPEG format
    buffered = BytesIO()
    image.save(buffered, format="JPEG")
    base64_image = base64.b64encode(buffered.getvalue()).decode('utf-8')
    
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {openai_api_key}"
    }

    payload = {
        "model": "gpt-4o",
        "messages": [
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": "rate this advertising 1 to 10. And comment it."
                    },
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:image/jpeg;base64,{base64_image}"
                        }
                    }
                ]
            }
        ],
        "max_tokens": 300
    }

    response = requests.post("https://api.openai.com/v1/chat/completions", headers=headers, json=payload)
    response_data =  response.json()
    return response_data['choices'][0]['message']['content']

In [3]:
json_path = "./output/output.json"
read_img(json_path)


In [8]:
import numpy as np
from PIL import Image
import os
import json

def blend_images(position: str = 'center', alpha: float = 0.5, output_dir: str = './output') -> str:
    """
    Blends two images by placing the smaller image on top of the larger image at a specified position and saves the blended image.

    Parameters:
    - position (str): Position where the smaller image will be placed on the larger image.
                      Options are 'center', 'top-left', 'top-right', 'bottom-left', 'bottom-right'. Default is 'center'.
    - alpha (float): Blending factor for transparency (0.0 to 1.0). Default is 0.5.
    - output_dir (str): Directory where the blended image will be saved. Default is './output'.

    Returns:
    - str: Path to the JSON file containing the output image information.
    """
    larger_image_path = '/home/jabez_kassa/week_12_updated/Semantic-Image-and-Text-Alignment/data/Assets/015efcdd8de3698ffc4dad6dabd6664a/endframe_1.jpg'
    smaller_image_path = '/home/jabez_kassa/week_12_updated/Semantic-Image-and-Text-Alignment/data/Assets/015efcdd8de3698ffc4dad6dabd6664a/cta.jpg'

    # Load images from file paths
    larger_image = Image.open(larger_image_path)
    smaller_image = Image.open(smaller_image_path)

    # Convert images to NumPy arrays
    larger_image_np = np.array(larger_image)
    smaller_image_np = np.array(smaller_image)

    # Get dimensions
    larger_h, larger_w = larger_image_np.shape[:2]
    smaller_h, smaller_w = smaller_image_np.shape[:2]

    # Resize smaller image if necessary
    if smaller_h > larger_h or smaller_w > larger_w:
        aspect_ratio = smaller_w / smaller_h
        if smaller_h > larger_h:
            smaller_h = larger_h
            smaller_w = int(smaller_h * aspect_ratio)
        if smaller_w > larger_w:
            smaller_w = larger_w
            smaller_h = int(smaller_w / aspect_ratio)
        smaller_image = smaller_image.resize((smaller_w, smaller_h))
        smaller_image_np = np.array(smaller_image)

    # Determine position
    positions = {
        'center': ((larger_w - smaller_w) // 2, (larger_h - smaller_h) // 2),
        'top-left': (0, 0),
        'top-right': (larger_w - smaller_w, 0),
        'bottom-left': (0, larger_h - smaller_h),
        'bottom-right': (larger_w - smaller_w, larger_h - smaller_h)
    }

    if position not in positions:
        raise ValueError("Invalid position argument. Choose from 'center', 'top-left', 'top-right', 'bottom-left', 'bottom-right'")
    
    x_offset, y_offset = positions[position]

    # Blend images
    blended_image_np = larger_image_np.copy()
    for c in range(3):
        blended_image_np[y_offset:y_offset+smaller_h, x_offset:x_offset+smaller_w, c] = (
            alpha * smaller_image_np[:, :, c] +
            (1 - alpha) * larger_image_np[y_offset:y_offset+smaller_h, x_offset:x_offset+smaller_w, c]
        )

    # Convert blended image back to PIL Image
    blended_image = Image.fromarray(blended_image_np.astype('uint8'))

    # Save blended image to output directory
    os.makedirs(output_dir, exist_ok=True)
    output_file = os.path.join(output_dir, 'blended_image.png')
    blended_image.save(output_file)

    # Create JSON response
    json_data = {
        'output_path': output_file,
        'position': position,
        'alpha': alpha
    }

    # Write JSON to file
    json_output_file = os.path.join(output_dir, 'output.json')
    with open(json_output_file, 'w') as f:
        json.dump(json_data, f, indent=4)

    return json_output_file


In [5]:
llm_config2 = {"config_list": [{"model": "gpt-4", "api_key": openai_api_key}]}
code_execution_config = {"use_docker": False}

# Initialize the assistant agent with the given configurations
config_list = [
    {"model": "gpt-4", "api_key": openai_api_key, "api_type": "openai"},
]

In [6]:
llm_config={
        "temperature": 0,
        "timeout": 600,
        "cache_seed": 42,
        "config_list": config_list,
        "functions": [
             {
                        "name": "blend_images",
                        "description": "use this function to blend the images",
                        "parameters": {
                            "type": "object",
                            "properties": {
                                "position": {
                                    "type": "string",
                                    "description": "This is where you will position the blending"
                                },
                            },
                            "required": ["position"]
                        }
                        },
             {
                        "name": "read_img",
                        "description": "use this to read the image blended from blended image",
                        "parameters": {
                            "type": "object",
                            "properties": {
                                "output_json": {
                                    "type": "object",
                                    "description": "This is the blended image"
                                },
                            },
                            "required": ["position"]
                        }
                        }
                        ],
}


In [7]:
import os
import autogen

from autogen import ConversableAgent

# Let's first define the assistant agent that suggests tool calls.
img_blend_assistant = autogen.AssistantAgent(
    name="image_blending_assistant",
    code_execution_config=False,
    system_message="You are a helpful AI assistant. "
    "You can help with blending images. "
    "Return 'TERMINATE' when the task is done.",
    llm_config=llm_config,
    function_map={
        "blend_images": blend_images,
        "read_img": read_img
    }
        
)

img_critic_assistant = autogen.AssistantAgent(
    name="image_blending_assistant",
    code_execution_config=False,
    system_message="You are a critic AI assistant. "
    "You task is to critic the 'output.json' from 'img_blend_assistant' "
    "Return 'TERMINATE' when the task is done.",
    llm_config=llm_config,
    function_map={
        "blend_images": blend_images,
        "read_img": read_img
    }
        
)




# The user proxy agent is used for interacting with the assistant agent
# and executes tool calls.
def termination_msg(x):
    return isinstance(x, dict) and "TERMINATE" == str(x.get("content", ""))[-9:].upper()
user_proxy = autogen.UserProxyAgent(
    name="user_proxy",
    is_termination_msg=termination_msg,
    human_input_mode="NEVER",
    code_execution_config=False
    # # is_termination_msg=lambda x: "content" in x and x["content"] is not None and x["content"].rstrip().endswith("TERMINATE"),
    # # code_execution_config={"work_dir": "planning"},
    # function_map={"blend_images": blend_images},
    # code_execution_config=False
)



groupchat = autogen.GroupChat(
    agents=[user_proxy, img_blend_assistant, img_critic_assistant],
    messages=[],  # The initial messages in the chat
    max_round=10  # Maximum rounds of conversation
)

manager = autogen.GroupChatManager(
    groupchat=groupchat,
    llm_config=llm_config2
)



In [9]:
user_proxy.initiate_chat(
    manager, message="blend the images at bottom-right")

[33muser_proxy[0m (to chat_manager):

blend the images at bottom-right

--------------------------------------------------------------------------------
[32m
Next speaker: image_blending_assistant
[0m
[33mimage_blending_assistant[0m (to chat_manager):

[32m***** Suggested function call: blend_images *****[0m
Arguments: 
{
  "position": "bottom-right"
}
[32m*************************************************[0m

--------------------------------------------------------------------------------
[32m
Next speaker: image_blending_assistant
[0m
[35m
>>>>>>>> EXECUTING FUNCTION blend_images...[0m
[33mimage_blending_assistant[0m (to chat_manager):

[32m***** Response from calling function (blend_images) *****[0m
./output/output.json
[32m*********************************************************[0m

--------------------------------------------------------------------------------
[32m
Next speaker: image_blending_assistant
[0m
[33mimage_blending_assistant[0m (to chat_manager)

ChatResult(chat_id=None, chat_history=[{'content': 'blend the images at bottom-right', 'role': 'assistant'}, {'content': '', 'function_call': {'arguments': '{\n  "position": "bottom-right"\n}', 'name': 'blend_images'}, 'name': 'image_blending_assistant', 'role': 'assistant'}, {'content': './output/output.json', 'name': 'blend_images', 'role': 'function'}, {'content': '', 'function_call': {'arguments': '{}', 'name': 'read_img'}, 'name': 'image_blending_assistant', 'role': 'assistant'}, {'content': "Error: read_img() missing 1 required positional argument: 'json_path'", 'name': 'read_img', 'role': 'function'}, {'content': '', 'role': 'assistant'}, {'content': "I apologize for the confusion. It seems there was a misunderstanding in the process. Let's try again.", 'function_call': {'arguments': '{\n  "position": "bottom-right"\n}', 'name': 'blend_images'}, 'name': 'image_blending_assistant', 'role': 'assistant'}, {'content': './output/output.json', 'name': 'blend_images', 'role': 'function