# Introduction

    This demonstration uses a vision-language model (VLM) to generate a textual description of an image,
    running the model locally via Ollama.

In [7]:
import os
import pandas as pd           
from pathlib import Path
import ollama
import time
import torch
import gc
import base64
# Check for GPU driver.
# The device name is only queried when CUDA is usable, because
# torch.cuda.get_device_name(0) raises when no CUDA device can be initialised.
cuda_available = torch.cuda.is_available()
print(f"GPU check >>> {cuda_available}")  # Should print True
if cuda_available:
    print(f"GPU : {torch.cuda.get_device_name(0)}")  # Prints GPU name
else:
    print("GPU : no CUDA device available")

GPU check >>> True
GPU : NVIDIA GeForce RTX 5080


In [4]:
# Structured image-analysis prompt: asks for a six-section description
# (overall scene, key objects, colors/lighting/style, text and symbols,
# fine details, interpretation) and for objective, exhaustive wording.
# The leading indentation inside the triple-quoted string is part of the
# string value.
detail_prompt = """
            You are an expert visual analyst with exceptional attention to detail. 
            Analyze the provided image thoroughly and provide a highly detailed, structured description of everything you observe.
            Organize your response into the following sections:
                
                1.Overall Scene and Composition: 
                    Describe the main subject, setting, layout, perspective, and overall mood or atmosphere.
                
                2.Key Objects and Elements: 
                    List and describe all prominent objects, people, animals, or features, including their positions, sizes, 
                    shapes, and any notable details.
                
                3.Colors, Lighting, and Style: 
                    Detail the color palette, lighting conditions (e.g., natural, artificial, shadows, highlights), textures, 
                    and any artistic or photographic style (e.g., realistic, abstract, vintage).
                
                4.Text and Symbols: 
                    Transcribe any visible text exactly (including fonts, sizes, and positions), and note any 
                    logos, signs, or symbols.
                
                5.Fine Details and Anomalies: 
                    Point out subtle elements, potential emotions expressed, implied actions, or anything unusual/out of place.
                
                6.Interpretation and Context: Offer possible interpretations of what the image depicts, its purpose 
                (e.g., advertisement, meme, artwork), and any cultural or emotional implications.
                
                
            Be objective, precise, and exhaustive—aim for maximum detail without speculation unless clearly marked as such. 
            If the image contains multiple views or panels, analyze each separately.
            """

In [8]:
def img_describe(image, model="gemma3:12b", prompt="describe the image in details"):
    """Ask an Ollama model to describe one image and save the answer as text.

    The prompt and the image are sent as a single user message through
    ``ollama.chat``. On success the answer is printed, followed by a
    separator line, and written to ``<image without extension>_<model>_describe.txt``.

    Args:
        image: Path of the image file, passed to Ollama in the message's
            ``images`` list.
        model: Name of the Ollama model to query.
        prompt: Instruction text sent together with the image.

    Returns:
        None. Results are reported through stdout and the output file.

    Notes:
        ``ollama.ResponseError`` is caught and reported on stdout; in that
        case no output file is written. Errors raised while writing the
        output file are not caught.
    """
    print(f"description of {image} ")

    description = None
    try:
        # Generate a chat
        response = ollama.chat(
            model=model,
            messages=[{
                'role': 'user',
                'content': prompt,
                'images': [image],
            }],
        )
        description = response['message']['content']
    except ollama.ResponseError as err:
        # Keep the original message, but also surface what the server reported.
        print("Error: " + model + " cannot handle this format" + f" ({err})")

    if description is not None:
        print(description)
        print("-------------------------------------------------------------------")

        # -------------------Save the output response-------------------
        # splitext drops whatever extension the image has (.jpg, .JPG, .webp, ...)
        # and only at the end of the path; for "*.jpg" inputs the resulting name
        # is the same as before.
        # NOTE(review): model names such as "gemma3:12b" put a ':' into the file
        # name, which is not portable to every filesystem -- confirm this is acceptable.
        output_file = os.path.splitext(image)[0] + "_" + model + "_describe.txt"
        # Explicit UTF-8 so non-ASCII text in the answer does not depend on the
        # platform's default encoding; `with` closes the file even if write fails.
        with open(output_file, "w", encoding="utf-8") as out:
            out.write(description)

    # Clean up: releases cached GPU memory held by PyTorch in this process.
    # NOTE(review): presumably the Ollama model runs outside this process, so this
    # may not free the model's memory -- confirm whether the call is still needed.
    torch.cuda.empty_cache()

In [9]:
# File-name suffixes (compared case-insensitively) that mark a file as an image.
# Both carry the leading dot so that only real extensions match.
image_extensions = (".jpg", ".webp")

# Images are read from sub-folders of the working directory named after
# consecutive integers; folders lower_bound .. upper_bound - 1 are processed.
lower_bound = 0
upper_bound = 10

for folder_index in range(lower_bound, upper_bound):
    folder_path = str(folder_index)
    if not os.path.isdir(folder_path):
        # One missing folder should not abort the whole batch.
        print(f"Skipping {folder_path}: folder not found")
        continue

    # Sorted so images are processed in a stable, name-based order.
    image_files = sorted(
        name for name in os.listdir(folder_path)
        if name.lower().endswith(image_extensions)
    )

    for image_name in image_files:
        # Keep the "folder/file" form so printed paths and output file names
        # stay the same as before.
        image = folder_path + "/" + image_name
        # Call the describe function
        img_describe(image)

description of 0/2025-12-12_05-57-55_UTC.jpg 
Here's a detailed description of the image:

**Overall Impression:**

The image is an advertisement, likely for a mask promotion in Hong Kong. It's visually striking with a bold, dynamic layout and a focus on visual appeal. The background is a gradient of blues, creating a modern and eye-catching aesthetic.

**Visual Elements:**

*   **Masks:** There are four surgical masks prominently displayed. They are stacked slightly askew, giving a sense of motion and casualness. Each mask is white with a green pattern and the words "HONGKONG MASK" printed on them.
*   **Price and Quantity:**
    *   A large, stylized "S10" symbol is displayed on the left side.
    *   The word "盒" (box) is printed next to the "S10".
    *   "每天限量60份" (60 units limited daily) is written underneath the "S10"
*   **Redemption Code and Dates:**
    *   "限位號碼: 5A35" (redemption code) is written on the bottom-left.
    *   "12月13日至1月5日" (dates December 13 to January 5) is 