In [None]:
pip install transformers einops torch

In [None]:
!wget https://www.cdc.gov/healthypets/images/pets/cute-dog-headshot.jpg

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from PIL import Image
import torch

# Hugging Face repo and the pinned revision of moondream2 to load.
model_id = "vikhyatk/moondream2"
revision = "2024-05-20"

# Accelerator preference: CUDA first, then Apple MPS, otherwise plain CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")

# trust_remote_code=True allows the model code shipped in the repo to run locally;
# pinning `revision` keeps that code (and the weights) fixed between runs.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    revision=revision,
    trust_remote_code=True,
)
model = model.to(device)

# NOTE(review): torch.compile wraps the module; confirm that the custom
# encode_image / answer_question entry points used below actually benefit from it.
model = torch.compile(model)

tokenizer = AutoTokenizer.from_pretrained(model_id, revision=revision)

In [4]:
%%time
image = Image.open("cute-dog-headshot.jpg")
enc_image = model.encode_image(image)
print(model.answer_question(enc_image, "Describe this image.", tokenizer))

A small, light brown dog with a white chest and paws is sitting on a wooden porch, wearing a blue collar with a pink and white pattern. The dog's eyes are focused on the camera, and its head is tilted slightly to the left. The porch has a wooden floor and a railing in the background, creating a rustic and cozy atmosphere.
CPU times: user 4.55 s, sys: 289 ms, total: 4.84 s
Wall time: 5.98 s


In [None]:
!pip install Pillow==10.1.0 timm==0.9.10 torch==2.1.2 torchvision==0.16.2 transformers==4.36.0 sentencepiece==0.1.99

In [None]:
# test.py
import torch
from PIL import Image
from transformers import AutoModel, AutoTokenizer

# Load MiniCPM-V-2; the checkpoint is first materialised in bfloat16.
model = AutoModel.from_pretrained(
    "openbmb/MiniCPM-V-2",
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
)

# Active configuration: NVIDIA GPUs that do NOT support BF16 (e.g. V100, T4, RTX 2080).
model = model.to(device="cuda", dtype=torch.float16)

# Alternative: NVIDIA GPUs that support BF16 (e.g. A100, H100, RTX 3090).
# model = model.to(device='cuda', dtype=torch.bfloat16)

# Alternative: Mac with MPS (Apple silicon or AMD GPUs).
# Run with `PYTORCH_ENABLE_MPS_FALLBACK=1 python test.py`.
# model = model.to(device='mps', dtype=torch.float16)

In [10]:
%%time
tokenizer = AutoTokenizer.from_pretrained('openbmb/MiniCPM-V-2', trust_remote_code=True)
model.eval()

image = Image.open('cute-dog-headshot.jpg').convert('RGB')
question = 'Describe this image.'
msgs = [{'role': 'user', 'content': question}]

res, context, _ = model.chat(
    image=image,
    msgs=msgs,
    context=None,
    tokenizer=tokenizer,
    sampling=True,
    temperature=0.7
)
print(res)


In the image, a small brown and white dog is captured in an intimate moment. The canine's gaze is directed upwards towards something off-frame to its right side, creating a sense of curiosity or anticipation. 
The dog stands on what appears to be a wooden floor with visible planks beneath it, adding a touch of rustic charm to the scene. A blue collar adorned with pink flowers adds a pop of color against the warm tones of the dog's fur. This accessory suggests that this might not just be any ordinary day for our furry friend but rather one filled with joyous moments like these.
CPU times: user 9.25 s, sys: 14.5 ms, total: 9.27 s
Wall time: 11.1 s
