# Introduction

    This demonstration uses a vision-language model (VLM) to generate a detailed textual description of an image.
    The model (`gemma3:4b`) is served by a locally deployed Ollama instance.

In [2]:
import os
import pandas as pd           
from pathlib import Path
import ollama
import time
import torch
import gc
import base64
# Check whether PyTorch can see a CUDA device.
cuda_available = torch.cuda.is_available()
print(f"GPU check >>> {cuda_available}")  # True when a CUDA device is usable
if cuda_available:
    print(f"GPU : {torch.cuda.get_device_name(0)}")  # Name of CUDA device 0
else:
    # Asking for a device name without CUDA raises, so report the absence instead.
    print("GPU : none detected")

GPU check >>> True
GPU : NVIDIA GeForce RTX 5080


In [3]:
def img_describe(image, model="gemma3:4b"):
    """Describe an image with an Ollama-served model and save the description.

    The image is read from disk, base64-encoded and sent together with a
    structured analysis prompt through ``ollama.chat``. The reply is printed
    and written to ``<image path without extension>_<model>_describe.txt``.

    Args:
        image: Path of the image file to describe.
        model: Ollama model tag to query. Defaults to ``"gemma3:4b"``.

    Returns:
        None. The description is printed and written to disk.

    Raises:
        OSError: If the image cannot be read or the output file cannot be
            written. ``ollama.ResponseError`` is caught and reported with a
            printed message; no output file is written in that case.
    """
    print(f"description of {image} ")

    # Read and encode the image
    with open(image, "rb") as image_file:
        image_base64 = base64.b64encode(image_file.read()).decode('utf-8')

    # A carefully structured prompt so that the model returns a sectioned,
    # detailed description rather than a one-line caption.
    detail_prompt = """
            You are an expert visual analyst with exceptional attention to detail. 
            Analyze the provided image thoroughly and provide a highly detailed, structured description of everything you observe.
            Organize your response into the following sections:
                
                1.Overall Scene and Composition: 
                    Describe the main subject, setting, layout, perspective, and overall mood or atmosphere.
                
                2.Key Objects and Elements: 
                    List and describe all prominent objects, people, animals, or features, including their positions, sizes, 
                    shapes, and any notable details.
                
                3.Colors, Lighting, and Style: 
                    Detail the color palette, lighting conditions (e.g., natural, artificial, shadows, highlights), textures, 
                    and any artistic or photographic style (e.g., realistic, abstract, vintage).
                
                4.Text and Symbols: 
                    Transcribe any visible text exactly (including fonts, sizes, and positions), and note any 
                    logos, signs, or symbols.
                
                5.Fine Details and Anomalies: 
                    Point out subtle elements, potential emotions expressed, implied actions, or anything unusual/out of place.
                
                6.Interpretation and Context: Offer possible interpretations of what the image depicts, its purpose 
                (e.g., advertisement, meme, artwork), and any cultural or emotional implications.
                
                
            Be objective, precise, and exhaustive—aim for maximum detail without speculation unless clearly marked as such. 
            If the image contains multiple views or panels, analyze each separately.
            """

    try:
        # Generate a chat
        response = ollama.chat(
            model=model,
            messages=[{
                'role': 'user',
                'content': detail_prompt,
                'images': [image_base64]
            }]
        )
    except ollama.ResponseError as err:
        print("Error: " + model + " cannot handle this format")
        # Surface the underlying reason instead of discarding it.
        print(f"Details: {err}")
    else:
        description = response['message']['content']
        print(description)
        print("-------------------------------------------------------------------")

        # -------------------Save the output response-------------------
        # Strip whatever extension the image has (.jpg, .JPG, .webp, ...) so the
        # output name has the same shape for every supported format.
        # NOTE(review): the model tag is embedded verbatim, so a tag such as
        # "gemma3:4b" puts a ':' in the file name - confirm that is acceptable
        # on the target filesystem.
        output_file = os.path.splitext(image)[0] + "_" + model + "_describe.txt"
        # Explicit UTF-8 so the result does not depend on the platform's default encoding.
        with open(output_file, "w", encoding="utf-8") as out_file:
            out_file.write(description)
    finally:
        # Clean up: runs on success and on error alike. There is no `del response`
        # here because the name is unbound when the request failed.
        torch.cuda.empty_cache()  # Clear GPU memory cached by torch, if using CUDA

In [4]:
# File-name suffixes (matched case-insensitively) that mark a file as an image.
# Each suffix carries its leading dot so that only real extensions match.
image_extensions = (".jpg", ".webp")

# Folders to scan are named after consecutive integers:
# str(lower_bound), ..., str(upper_bound - 1), relative to the working directory.
lower_bound = 0
upper_bound = 10

for folder_index in range(lower_bound, upper_bound):
    folder_path = str(folder_index)

    # A missing folder should not abort the remaining ones.
    if not os.path.isdir(folder_path):
        print(f"Skipping missing folder {folder_path}")
        continue

    # Collect the image files in a stable, sorted order.
    image_files = sorted(
        name for name in os.listdir(folder_path)
        if name.lower().endswith(image_extensions)
    )

    for image_name in image_files:
        image = folder_path + "/" + image_name
        # Call the describe function
        img_describe(image)

description of 0/2025-12-12_05-57-55_UTC.jpg 
Okay, here's a highly detailed analysis of the image, broken down into the requested sections.

**1. Overall Scene and Composition:**

* **Main Subject:** The image depicts a single, stylized hand holding a bright yellow, smooth, rounded object, likely a ball or sphere.
* **Setting:** The setting is a stark, minimalist white background, which isolates the hand and object. It evokes a sense of sterile purity or a studio setting.
* **Layout:** The composition is centered, placing the hand and object directly in the middle of the frame.  This symmetrical arrangement contributes to a sense of balance and simplicity. 
* **Perspective:** The image appears to be a straightforward, frontal shot with no perspective distortion.
* **Mood/Atmosphere:** The overall mood is clean, smooth, and slightly surreal due to the isolated subject and plain background. It feels artificial, almost like a product shot or a conceptual image. There’s a sense of quiet, 