# Captioning Images

This notebook processes a JSON file of images, generates captions for each image, and computes embeddings for those captions using Azure OpenAI services.

In [None]:
# Cell 1: Import necessary libraries
import base64
import json
import os
import time
import traceback
from mimetypes import guess_type

import openai
import pandas as pd
from dotenv import load_dotenv
from openai import AzureOpenAI

# Read settings from a local .env file before any os.getenv() call below.
# NOTE(review): override=True is expected to let .env values replace variables
# already present in the process environment — confirm against python-dotenv docs.
load_dotenv(override=True)

## Configuration

Set up the Azure OpenAI parameters.

In [None]:
# Cell 2: Set up the AzureOpenAI client
# Settings for the chat-completions (captioning) client. The endpoint and key
# are read from the environment; the deployment name and API version are fixed.
api_base = os.getenv("AZURE_OPENAI_ENDPOINT")
api_key = os.getenv("AZURE_OPENAI_API_KEY")
deployment_name = 'trygpt4o'
api_version = '2024-02-01'

# Fail fast: an unset endpoint would otherwise be formatted into the URL below
# as the literal text "None", producing an unusable base_url.
if not api_base:
    raise RuntimeError(
        "AZURE_OPENAI_ENDPOINT is not set; add it to the environment or the .env file."
    )

# Drop any trailing slash so the URL below does not contain "//openai".
api_base = api_base.rstrip("/")

# Client used for image captioning; requests go straight to the deployment URL.
client = AzureOpenAI(
    api_key=api_key,
    api_version=api_version,
    base_url=f"{api_base}/openai/deployments/{deployment_name}"
)

# Separate client for embeddings, configured entirely from EMBEDDING_* variables.
embedding_client = AzureOpenAI(
    api_key=os.getenv("EMBEDDING_OPENAI_API_KEY"),
    api_version=os.getenv("EMBEDDING_OPENAI_API_VERSION"),
    azure_endpoint=os.getenv("EMBEDDING_OPENAI_API_ENDPOINT")
)

## Helper Functions

We need some helper functions to handle image processing and data formatting.

In [None]:
# Cell 3: Define functions to convert local image to data url and process data
def local_image_to_data_url(image_path):
    """Return the file at ``image_path`` encoded as a base64 ``data:`` URL.

    The MIME type is guessed from the file name; when no type can be guessed,
    ``application/octet-stream`` is used instead.
    """
    guessed_type = guess_type(image_path)[0]
    content_type = 'application/octet-stream' if guessed_type is None else guessed_type

    with open(image_path, "rb") as handle:
        raw_bytes = handle.read()

    payload = base64.b64encode(raw_bytes).decode('utf-8')
    return "data:" + content_type + ";base64," + payload

def process_data(data, book_name):
    """Assign an id and a model-generated caption to every image record.

    For each record the function:
      1. sets ``id`` to ``"<page>_<n>"``, where ``n`` is the 1-based position
         of the record among those seen so far with the same ``page``;
      2. sends the whole-page image and the cropped image to the chat
         deployment together with the captioning prompt;
      3. stores the reply text under ``caption``.

    Args:
        data: List of dicts, each with at least ``page`` and ``image`` keys.
            The records are updated in place.
        book_name: Path prefix of the image folders. Whole pages are read from
            ``<book_name>_wholepageimage/`` and crops from
            ``<book_name>_cropimage/``.

    Returns:
        The same ``data`` list that was passed in.

    Notes:
        Only ``openai.BadRequestError`` and ``openai.InternalServerError`` are
        retried (one retry, after a one-second pause). A record whose request
        fails on both attempts is left without a ``caption`` key. Any other
        exception propagates to the caller.
    """
    user_prompt = "Find the associate caption for the cropped image from the whole_page image. If you cannot find the caption, make up a caption yourself by providing a detailed description of the image. Your response must be only the caption itself, do not respond anything other than this."
    system_prompt = "You are a helpful AI assistant."
    max_attempts = 2
    page_image_count = {}

    for item in data:
        page = item['page']
        page_image_count[page] = page_image_count.get(page, 0) + 1
        item['id'] = f"{page}_{page_image_count[page]}"

        # The whole-page file name is the crop file name up to "_img_", plus ".png".
        wholepage_url = local_image_to_data_url(book_name + "_wholepageimage/" + item['image'].split("_img_")[0] + ".png")
        crop_url = local_image_to_data_url(book_name + "_cropimage/" + item['image'])

        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": [
                {"type": "image_url", "image_url": {"url": wholepage_url, "detail": "high"}},
                {"type": "text", "text": "This is whole_page image\n"},
                {"type": "image_url", "image_url": {"url": crop_url, "detail": "low"}},
                {"type": "text", "text": "This is crop_image " + user_prompt},
            ]},
        ]

        for attempt in range(1, max_attempts + 1):
            try:
                response = client.chat.completions.create(
                    model=deployment_name,
                    messages=messages,
                    max_tokens=2000
                )
                item['caption'] = response.choices[0].message.content
                print(item, "\n")
                break
            # Requires the module-level ``import openai``; without it this
            # clause itself raises NameError as soon as a request fails.
            except (openai.BadRequestError, openai.InternalServerError):
                if attempt == max_attempts:
                    print(f"Error getting caption for {item['image']} after retry\n")
                    traceback.print_exc()
                else:
                    print(f"Retrying to get caption for {item['image']}\n")
                    time.sleep(1)
    return data

## Load and Process Data

Let's load the input data and process it.

In [None]:
# Cell 4: Process the data
input_file = "output/demofile.json"  # replace with your actual file path
output_file = "imagecaption.json"  # replace with your actual file path

# Open JSON explicitly as UTF-8 so the result does not depend on the
# platform's default text encoding.
with open(input_file, 'r', encoding='utf-8') as f:
    data = json.load(f)

# The image folders are located from the input path with ".json" removed,
# e.g. "output/demofile" -> "output/demofile_cropimage/".
data = process_data(data, input_file.split(".json")[0])

with open(output_file, 'w', encoding='utf-8') as f:
    json.dump(data, f, indent=4)

print(f"Processed data saved to {output_file}.")