 # Image categoriser
 
 A proof of concept for a zero-shot photograph categoriser using the LLaVA vision-language model (VLM) served through Ollama

In [None]:
# Import whole modules rather than individual names so every call site shows which library a function comes from
import datetime
import fnmatch
import functools
import ollama
import os
import pandas as pd
import typing

In [None]:
# Classification settings: the folder that is scanned for pictures and the
# labels the model is allowed to choose from for each picture.
folder_to_classify = "D:/pictures"
style_categories = [
    "Photo",
    "Art",
    "Diagram",
    "Other",
]
subject_categories = [
    "Landscape",
    "Urban",
    "Portrait",
    "Pets",
    "Wildlife",
    "Other",
]

In [None]:
# Record the wall-clock start time so the total run time can be printed at the end
start_time = datetime.datetime.now()

In [None]:
def image_files_generator(directory: str) -> typing.Generator[str, None , None]:
    """
    Recursively finds all image files in the specified directory and its subdirectories.

    File names are compared with the supported extension patterns
    case-insensitively on every platform, so a file such as ``IMG_0001.JPG``
    is found on case-sensitive file systems as well.

    Args:
        directory (str): The path to the directory to search in.

    Yields:
        str: path to each image file found, one at a time, with any
        backslashes replaced by forward slashes. Within a directory the
        files are yielded grouped by extension pattern, in pattern order.
    """
    # '*.gif', '*.bmp' and '*.tiff' are currently not enabled
    image_extensions = ('*.jpg', '*.jpeg', '*.png')

    for root, _dirs, files in os.walk(directory):
        for extension in image_extensions:
            for filename in files:
                # Lower-case the name and use fnmatchcase so matching does not
                # depend on the platform's case rules (fnmatch.filter does).
                if fnmatch.fnmatchcase(filename.lower(), extension):
                    yield os.path.join(root, filename).replace("\\", "/")

In [None]:
# Materialise the generator into a list of every image path found under folder_to_classify
image_list = list(image_files_generator(directory = folder_to_classify))

In [None]:
def decode_response(response: str) -> tuple[str, str, str]:
    """
    Decodes the pipe-separated text returned by the VLM.

    The expected form is ``Style|Subject|Description``. The text is split on
    the first two ``|`` characters only, so a description that itself contains
    a pipe is kept intact. Surrounding whitespace is stripped from each field.

    If fewer than three fields are present nothing can be trusted to be in the
    right position, so the whole (stripped) response is returned as the
    description and both style and category are reported as ``"unknown"``.

    Args:
        response (str): The text content of the VLM's reply.

    Returns:
        Tuple[str, str, str]: A tuple containing the style, category, and description.
    """
    # maxsplit=2 keeps any further '|' characters inside the description
    fields = [field.strip() for field in response.split('|', 2)]

    if len(fields) < 3:
        # Malformed reply: keep everything we received rather than guessing
        # which field is which.
        return "unknown", "unknown", response.strip()

    style, category, description = fields
    return style, category, description

In [None]:
def categorise(
    image_path: str,
    style_categories: list[str],
    subject_categories: list[str]
) -> tuple[str, str, str]:
    """
    Categorise an image by style and subject and produce a short description.

    Builds a text prompt listing the permissible categories, sends it together
    with the image path to the "llava" model through ``ollama.chat``, and
    decodes the pipe-separated reply with ``decode_response``.

    Args:
        image_path (str): The file path to the image.
        style_categories (List[str]): List of permissible style categories.
            Must not be empty: the first entry is used in the prompt's example.
        subject_categories (List[str]): List of permissible subject categories.
            Must not be empty: the first entry is used in the prompt's example.

    Returns:
        Tuple[str, str, str]: The style, category (subject) and description
        as decoded by ``decode_response``.

    Raises:
        IndexError: If ``style_categories`` or ``subject_categories`` is empty.
    """

    # compile the query to go with the image
    text = f"""Please categorise this image.
To do this you will return a pipe separated string of the form: 
Style|Subject|Description 
Permissible styles are: {", ".join(style_categories)} . Pick only the most appropriate one.
Permissible subjects are: {", ".join(subject_categories)} . Pick only the most appropriate one.
The description should be no more than 30 words long and should describe the picture as accurately as possible
Return no other detail other than the 3 pipe separated items so for instance if the style was {style_categories[0]}
and the subject was {subject_categories[0]} and the desciption was 'A small apple in a green bowl' you would return:
{style_categories[0]}|{subject_categories[0]}|A small apple in a green bowl
"""

    # a single user message carrying both the prompt and the image to analyse
    response = ollama.chat(
        model="llava",
        messages=[
            {
                'role': 'user',
                'content': text,
                'images': [image_path]
            }
        ]
    )

    # the reply text lives under message -> content; decode it into the 3-tuple
    return decode_response(response['message']['content'])

# Pre-bind the permissible category lists so callers only need to pass the image path
partial_categorise = functools.partial(categorise, style_categories=style_categories, subject_categories=subject_categories)

In [None]:
def categorise_image_files(image_files:list[str]) -> typing.Generator[dict[str, str], None, None]:
    """
    Categorises a list of image files, one at a time.

    Each image path is passed to ``partial_categorise`` and the result is
    yielded as soon as it is available, so callers can consume results
    lazily or collect them with ``list(...)``.

    Args:
        image_files (List[str]): A list of image file paths.

    Yields:
        Dict[str, str]: One dictionary per image with the keys 'file'
        (the file path), 'style', 'category' and 'description'.
    """
    for image_file in image_files:
        style, category, description = partial_categorise(image_file)
        yield {
            'file': image_file,
            'style': style,
            'category': category,
            'description': description,
        }

In [None]:
# Run the categoriser over every image found and collect the per-image result dictionaries
categorised_pictures = list(categorise_image_files(image_list))

In [None]:
# Build a table with one row per picture from the list of result dictionaries
df_pictures = pd.DataFrame(categorised_pictures)
# NOTE(review): display is not imported in this file — presumably supplied by the notebook environment
display(df_pictures)

In [None]:
# Persist the results table as an Excel workbook inside the classified folder
df_pictures.to_excel(f"{folder_to_classify}/llava_picture_classification.xlsx")

In [None]:
# Print the elapsed wall-clock time since start_time was recorded (a datetime.timedelta)
end_time = datetime.datetime.now()
total_time = end_time - start_time
print(total_time)