# Fine-tuning an Image Prediction LLM Using GPT-4o

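In this tutorial, we use TextGrad to tune the system prompt of a GPT-4o image-prediction assistant (the model weights themselves are not changed) so that it determines whether an image is funny and outputs a JSON object containing an explanation and the funny status.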

In [None]:
import os
import json
import textgrad as tg
from textgrad.engine.openai import ChatOpenAI
from textgrad.utils.image_utils import download_and_cache_image
from textgrad.variable import Variable
from textgrad.loss import ImageQALoss
from textgrad.optimizer import TGD
from textgrad.tasks import load_task
from textgrad import BlackboxLLM
from textgrad.config import set_backward_engine
from textgrad.autograd import MultimodalLLMCall
from textgrad.tasks.multimodal import load_multimodal_instance_task


## Load a batch of local images with labels for training and validation

In [None]:
# Build a labeled image dataset from a local directory and split it 80/20
# into training and validation sets.
import random

# Define the directory containing the images
image_dir = "path/to/your/image/directory"

# Accepted image extensions (matched case-insensitively, so ".JPG" also counts).
IMAGE_EXTENSIONS = (".jpg", ".png")

# Filename markers for negative examples. These must be checked before the
# plain "funny" substring test: a name such as "not_funny_01.jpg" also
# contains "funny" and would otherwise be mislabeled as a funny image.
NOT_FUNNY_MARKERS = ("not funny", "not_funny", "not-funny", "notfunny")

# Load the images and their labels
samples = []
# os.listdir() returns entries in arbitrary order; sort so runs are reproducible.
for filename in sorted(os.listdir(image_dir)):
    lowered = filename.lower()
    if not lowered.endswith(IMAGE_EXTENSIONS):
        continue
    is_funny = "funny" in lowered and not any(
        marker in lowered for marker in NOT_FUNNY_MARKERS
    )
    samples.append(
        (os.path.join(image_dir, filename), "funny" if is_funny else "not funny")
    )

# Shuffle with a fixed seed before splitting. Because the label is encoded in
# the filename, an unshuffled (sorted) list would group the classes together
# and could leave the validation set with a single class.
random.Random(0).shuffle(samples)
images = [path for path, _ in samples]
labels = [label for _, label in samples]

# Split the data into training and validation sets
split_index = int(0.8 * len(images))
train_images = images[:split_index]
train_labels = labels[:split_index]
val_images = images[split_index:]
val_labels = labels[split_index:]

## Use GPT-4o to output a JSON object with an explanation and the funny status

In [None]:
# Optimize the assistant's system prompt with TextGrad: for each training
# image, query the model, have an evaluator LLM critique the answer, and let
# the TGD optimizer rewrite the system prompt from that textual feedback.

# Set the GPT-4o engine.
# override=True lets this notebook cell be re-run without TextGrad complaining
# that a backward engine is already set.
set_backward_engine("gpt-4o", override=True)
# Use TextGrad's engine factory so the engine is configured for image input;
# presumably a directly constructed ChatOpenAI(...) is not flagged as
# multimodal by default -- confirm against the installed TextGrad version.
engine = tg.get_engine("gpt-4o")

# Define the system prompt.
# requires_grad=True is essential: this is the only variable handed to the
# optimizer, and a variable that does not require gradients is never updated.
# The prompt also states the JSON output format explicitly, because the
# validation step parses the reply with json.loads and reads the "funny" key.
system_prompt = Variable(
    "You are an assistant that determines if an image is funny and provides an explanation. "
    'Respond with only a JSON object of the form '
    '{"explanation": "<short explanation>", "funny": true or false}.',
    requires_grad=True,
    role_description="system prompt for the assistant",
)

# Define the loss function
evaluation_instruction = "Please evaluate the existing answer to the visual question without solving it yourself. Verify that the answer provides accurate reasoning logic to address the question."
eval_fn = ImageQALoss(evaluation_instruction=evaluation_instruction, engine=engine)

# Define the optimizer.
# The constraint keeps the optimizer from rewriting away the JSON output format.
optimizer = TGD(
    parameters=[system_prompt],
    constraints=[
        'The prompt must instruct the assistant to respond with only a JSON '
        'object containing the keys "explanation" (string) and "funny" (boolean).'
    ],
)

# The call wrapper only holds the engine and the system prompt variable, so it
# is built once instead of once per sample.
llm_call = MultimodalLLMCall(engine=engine, system_prompt=system_prompt)

# Finetune the model
for epoch in range(10):
    # NOTE(review): `label` is not used by the loss below -- the evaluator only
    # judges the reasoning of the response. Consider feeding the ground-truth
    # label into the evaluation if supervised feedback is intended.
    for image_path, label in zip(train_images, train_labels):
        # The dataset consists of local files, so read the raw bytes directly
        # rather than going through the URL download helper.
        with open(image_path, "rb") as image_file:
            image_data = image_file.read()
        # One image variable shared by the model call and the loss; images are
        # inputs only, so no gradient is requested for them.
        image = Variable(image_data, requires_grad=False, role_description="image input")
        question = Variable("Is this image funny?", requires_grad=False, role_description="question to the assistant")
        response = llm_call([image, question])
        loss = eval_fn(image=image, question=question, response=response)
        loss.backward()
        optimizer.step()
        # Clear the accumulated feedback so the next sample starts fresh.
        system_prompt.reset_gradients()
    print(f"Epoch {epoch + 1} completed.")

## Validate the model

In [None]:
# Validate the model: query the assistant on every validation image, parse the
# JSON verdict from its reply, and report the fraction that matches the label.


def _parse_funny(raw_response):
    """Extract the boolean "funny" verdict from a model reply.

    Returns True or False when a verdict can be read, or None when the reply
    contains no parseable JSON object with a usable "funny" entry.
    """
    # Models often wrap JSON in Markdown code fences or add surrounding text,
    # so parse only the outermost {...} span.
    start = raw_response.find("{")
    end = raw_response.rfind("}")
    if start == -1 or end < start:
        return None
    try:
        payload = json.loads(raw_response[start:end + 1])
    except json.JSONDecodeError:
        return None
    if not isinstance(payload, dict) or "funny" not in payload:
        return None
    verdict = payload["funny"]
    if isinstance(verdict, str):
        # A non-empty string such as "false" is truthy in Python, so string
        # verdicts are mapped explicitly instead of relying on truthiness.
        normalized = verdict.strip().lower()
        if normalized in ("true", "yes", "funny"):
            return True
        if normalized in ("false", "no", "not funny"):
            return False
        return None
    return bool(verdict)


llm_call = MultimodalLLMCall(engine=engine, system_prompt=system_prompt)

correct = 0
total = 0
unparseable = 0
for image_path, label in zip(val_images, val_labels):
    # The dataset consists of local files, so read the raw bytes directly
    # rather than going through the URL download helper.
    with open(image_path, "rb") as image_file:
        image_data = image_file.read()
    image = Variable(image_data, requires_grad=False, role_description="image input")
    question = Variable("Is this image funny?", requires_grad=False, role_description="question to the assistant")
    response = llm_call([image, question])
    predicted_funny = _parse_funny(response.value)
    total += 1
    if predicted_funny is None:
        # Count malformed replies as wrong instead of aborting the whole run.
        unparseable += 1
        continue
    if predicted_funny == (label == "funny"):
        correct += 1

if unparseable:
    print(f"Could not parse {unparseable} of {total} responses; they were counted as incorrect.")
if total == 0:
    # Avoid a ZeroDivisionError when the validation split is empty.
    print("Validation set is empty; accuracy is undefined.")
else:
    accuracy = correct / total
    print(f"Validation accuracy: {accuracy * 100:.2f}%")