# Demo

In [None]:
# from inference import SimpleInference

# # Initialize model
# engine = SimpleInference()

# # Generate description
# engine.generate(
#     image_path="examples/city.jpg",
#     prompt="describe",
#     max_tokens=1024,
#     temperature=0.8,
#     top_p = 0.9
# )



# Step 1: Prepare the image and prompt

## Load the image

In [None]:
from PIL import Image

# Open the sample picture from the examples folder.
image = Image.open("examples/panda.jpg")
# Bare trailing expression: in a notebook cell this displays the image inline.
image


## Resize the image to 224x224

In [None]:
from PIL import Image
import numpy as np
from typing import List, Tuple, Optional, Union, Iterable, Dict

def rescale(
    image: np.ndarray, scale: float, dtype: np.dtype = np.float32
) -> np.ndarray:
    """Multiply every element of ``image`` by ``scale`` and cast the result.

    Args:
        image: Array of pixel values.
        scale: Factor applied element-wise to ``image``.
        dtype: NumPy dtype of the returned array (``np.float32`` by default).

    Returns:
        A new array holding ``image * scale`` converted to ``dtype``.
    """
    return (image * scale).astype(dtype)


def resize(
    image: Image.Image,
    size: Tuple[int, int],
    resample: Optional[Image.Resampling] = None,
    reducing_gap: Optional[float] = None,
) -> Image.Image:
    """Resize a PIL image to the requested ``(height, width)``.

    Args:
        image: The PIL image to resize.
        size: Target size given as ``(height, width)``.
        resample: Resampling filter forwarded to ``Image.resize``; ``None``
            lets PIL pick its default.
        reducing_gap: Forwarded unchanged to ``Image.resize``.

    Returns:
        The resized PIL image (not a NumPy array; callers convert it
        themselves).
    """
    height, width = size
    # PIL takes (width, height), the reverse of the (height, width) order
    # used by this function's `size` argument.
    return image.resize(
        (width, height), resample=resample, reducing_gap=reducing_gap
    )


def normalize(
    image: np.ndarray,
    mean: Union[float, Iterable[float]],
    std: Union[float, Iterable[float]],
) -> np.ndarray:
    """Return ``(image - mean) / std`` computed element-wise.

    ``mean`` and ``std`` are first converted to arrays of the same dtype as
    ``image`` so the arithmetic does not change the image's dtype; they then
    broadcast against ``image`` following the usual NumPy rules.

    Args:
        image: Array of pixel values.
        mean: Scalar or per-channel values subtracted from ``image``.
        std: Scalar or per-channel values the centered image is divided by.

    Returns:
        The normalized array.
    """
    offset = np.array(mean, dtype=image.dtype)
    spread = np.array(std, dtype=image.dtype)
    centered = image - offset
    return centered / spread


def process_images(
    images: List[Image.Image],
    size: Optional[Tuple[int, int]] = None,
    resample: Optional[Image.Resampling] = None,
    rescale_factor: Optional[float] = None,
    image_mean: Optional[Union[float, List[float]]] = None,
    image_std: Optional[Union[float, List[float]]] = None,
) -> List[np.ndarray]:
    """Resize, rescale, normalize and channel-first transpose a list of images.

    Args:
        images: PIL images to preprocess.
        size: Target size as a ``(height, width)`` sequence. Required.
        resample: Resampling filter forwarded to ``resize``; ``None`` uses
            PIL's default.
        rescale_factor: Factor every pixel value is multiplied by
            (e.g. ``1 / 255.0`` maps 8-bit values into ``[0, 1]``). Required.
        image_mean: Scalar or per-channel mean subtracted after rescaling.
            Required.
        image_std: Scalar or per-channel divisor applied after the mean is
            subtracted. Required.

    Returns:
        One array per input image, laid out as ``[channel, height, width]``.

    Raises:
        TypeError: If ``size``, ``rescale_factor``, ``image_mean`` or
            ``image_std`` is left as ``None``.
    """
    # Fail fast on missing settings. Without this check a missing mean/std
    # would not raise at all: np.array(None, dtype=float32) is NaN, so the
    # whole output would silently become NaN.
    required = {
        "size": size,
        "rescale_factor": rescale_factor,
        "image_mean": image_mean,
        "image_std": image_std,
    }
    missing = [name for name, value in required.items() if value is None]
    if missing:
        raise TypeError(
            f"process_images() missing required argument(s): {', '.join(missing)}"
        )

    height, width = size[0], size[1]
    processed: List[np.ndarray] = []
    for image in images:
        resized = resize(image=image, size=(height, width), resample=resample)
        pixels = np.array(resized)
        # Scale the raw pixel values by rescale_factor.
        pixels = rescale(pixels, scale=rescale_factor)
        # Shift and scale with the supplied mean / std.
        pixels = normalize(pixels, mean=image_mean, std=image_std)
        # The model expects [channel, height, width], so move the channel
        # axis (last after conversion from PIL) to the front.
        processed.append(pixels.transpose(2, 0, 1))

    return processed


In [None]:
from PIL import Image
import numpy as np
import torch

# Per-channel normalization statistics, taken from the HF code.
IMAGENET_STANDARD_MEAN = [0.5, 0.5, 0.5]
IMAGENET_STANDARD_STD = [0.5, 0.5, 0.5]

image = Image.open("examples/home.jpg")

# Run the preprocessing pipeline on a one-image batch.
pixel_values = process_images(
    images=[image],
    size=(224, 224),
    resample=Image.Resampling.BICUBIC,
    rescale_factor=1 / 255.0,
    image_mean=IMAGENET_STANDARD_MEAN,
    image_std=IMAGENET_STANDARD_STD,
)

# Stack the per-image arrays along a new leading batch axis and hand the
# result to PyTorch: [batch, channel, height, width].
pixel_values = torch.tensor(np.stack(pixel_values, axis=0))

print(pixel_values.shape)


In [None]:
import torch
import torchvision.transforms as T
from PIL import Image
import numpy as np

def preprocess_single_image(image, size=(224, 224), mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)):
    """Turn one image into a normalized ``[1, C, H, W]`` tensor.

    Args:
        image: A file path (``str``), a NumPy array, or a ``PIL.Image.Image``.
        size: Target size handed to ``torchvision.transforms.Resize``.
        mean: Per-channel mean handed to ``torchvision.transforms.Normalize``.
        std: Per-channel std handed to ``torchvision.transforms.Normalize``.

    Returns:
        The transformed image with a leading batch dimension of size 1.

    Raises:
        ValueError: If ``image`` is none of the supported input types.
    """
    # Accepted inputs: a path string, a NumPy array, or a PIL image.
    if isinstance(image, str):
        image = Image.open(image)
    elif isinstance(image, np.ndarray):
        image = Image.fromarray(image)
    elif not isinstance(image, Image.Image):
        raise ValueError("Unsupported image type")

    # Convert every input to RGB, including a PIL image passed in directly,
    # so the tensor always has 3 channels to match the 3-element default
    # mean/std. Previously only the str and ndarray branches were converted.
    image = image.convert("RGB")

    transform = T.Compose([
        T.Resize(size),  # resize to the requested size
        T.ToTensor(),  # convert to a [C, H, W] tensor
        T.Normalize(mean=mean, std=std),
    ])
    tensor = transform(image)

    # Add the batch dimension: [C, H, W] -> [1, C, H, W]
    return tensor.unsqueeze(0)

# Usage: preprocess the `image` loaded in an earlier cell.
result = preprocess_single_image(image)  # torch.Size([1, 3, 224, 224])
result.shape

## Prompt

In [None]:
# Text prompt used in this demo alongside the image.
prompt = "describe the image"

In [None]:
def add_image_tokens_to_prompt(prefix_prompt, bos_token, image_seq_len, image_token):
    """Assemble the text fed to the tokenizer for one image/prompt pair.

    The returned string is laid out as: ``image_token`` repeated
    ``image_seq_len`` times, then ``bos_token``, then ``prefix_prompt``,
    then a trailing newline character.

    The newline is part of the input prompt the model was trained with, so
    it is appended explicitly to make sure it is always present. Unlike in
    the PaliGemma paper, the Hugging Face code does not tokenize that newline
    separately.

    Args:
        prefix_prompt: The user's text prompt.
        bos_token: The beginning-of-sequence token string.
        image_seq_len: How many image placeholder tokens to prepend.
        image_token: The image placeholder token string.

    Returns:
        The assembled prompt string.
    """
    image_placeholders = image_token * image_seq_len
    return "{}{}{}\n".format(image_placeholders, bos_token, prefix_prompt)


In [None]:
# Build one model-ready string per prompt in `text` by prepending the image
# placeholder tokens and the BOS token (see add_image_tokens_to_prompt).
# NOTE(review): `self` and `text` are not defined in any cell visible here;
# this snippet looks lifted from a method of a processor class. Confirm the
# intended values before running this cell on its own.
# The loop variable `prompt` is local to the comprehension and does not
# overwrite the notebook-level `prompt`.
input_strings = [
            add_image_tokens_to_prompt(
                prefix_prompt=prompt,
                bos_token=self.tokenizer.bos_token,
                image_seq_len=self.image_seq_length,
                image_token=self.IMAGE_TOKEN,
            )
            for prompt in text
        ]