In [2]:
# Every package is pinned so a fresh kernel installs the same set each time.
# transformers 4.42.3 is the version the recorded install output resolved to.
%pip install transformers==4.42.3 \
             datasets==2.14.4 \
             diffusers==0.20.0 \
             accelerate==0.21.0 \
             torch==2.0.1 \
             torchvision==0.15.2 \
             sentencepiece==0.1.99

Collecting transformers
  Downloading transformers-4.42.3-py3-none-any.whl.metadata (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m698.7 kB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting datasets==2.14.4
  Downloading datasets-2.14.4-py3-none-any.whl.metadata (19 kB)
Collecting diffusers==0.20.0
  Downloading diffusers-0.20.0-py3-none-any.whl.metadata (17 kB)
Collecting accelerate==0.21.0
  Using cached accelerate-0.21.0-py3-none-any.whl.metadata (17 kB)
Collecting torch==2.0.1
  Using cached torch-2.0.1-cp310-cp310-manylinux1_x86_64.whl.metadata (24 kB)
Collecting torchvision==0.15.2
  Downloading torchvision-0.15.2-cp310-cp310-manylinux1_x86_64.whl.metadata (11 kB)
Collecting sentencepiece==0.1.99
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.7 kB)
Collecting dill<0.3.8,>=0.3.0 (from datasets==2.14.4)
  Downloading dill-0.3.7-py3-none-any.whl.metadata (9.9 kB)
Coll

In [None]:
import torch
from transformers import IdeficsForVisionText2Text, AutoProcessor

# Run on CUDA when torch reports it as available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

model_name = "HuggingFaceM4/idefics-9b-instruct"

# Load the checkpoint in bfloat16, then move it onto the selected device.
model = IdeficsForVisionText2Text.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
)
model = model.to(device)
processor = AutoProcessor.from_pretrained(model_name)

# Generation args
# Token ids handed to generate() as `eos_token_id` in the cells below.
exit_condition = processor.tokenizer(
    "<end_of_utterance>",
    add_special_tokens=False,
).input_ids
# Token ids handed to generate() as `bad_words_ids` in the cells below.
bad_words_ids = processor.tokenizer(
    ["<image>", "<fake_token_around_image>"],
    add_special_tokens=False,
).input_ids

Downloading config.json:   0%|          | 0.00/1.41k [00:00<?, ?B/s]

Downloading (…)fetensors.index.json:   0%|          | 0.00/99.3k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading (…)of-00002.safetensors:   0%|          | 0.00/9.97G [00:00<?, ?B/s]

Downloading (…)of-00002.safetensors:   0%|          | 0.00/7.89G [00:00<?, ?B/s]

# Zero-shot inference

![](https://hips.hearstapps.com/hmg-prod/images/dog-puns-1581708208.jpg)

In [None]:
# Fetch the example image through the processor's image helper.
url = "https://hips.hearstapps.com/hmg-prod/images/dog-puns-1581708208.jpg"
fetched_images = processor.image_processor.fetch_images([url])
img = fetched_images[0]

# Zero-shot prompt: one user turn (image + instruction) and an open assistant turn.
prompts = ["\nUser:", img, "Describe this image.\nAssistant: "]

inputs = processor(prompts, return_tensors="pt", debug=True)
inputs = inputs.to(device)

generated_ids = model.generate(
    **inputs,
    eos_token_id=exit_condition,
    bad_words_ids=bad_words_ids,
    max_length=100,
)

# Keep special tokens in the decoded text and display the first sequence.
decoded = processor.batch_decode(generated_ids, skip_special_tokens=False)
generated_text = decoded[0]
generated_text

# One-shot Inference (1 example) to guide the description

![](https://hips.hearstapps.com/hmg-prod/images/cute-photos-of-cats-in-grass-1593184777.jpg)
![](https://hips.hearstapps.com/hmg-prod/images/dog-puns-1581708208.jpg)

In [None]:
# Image used for the worked (one-shot) example.
url = "https://hips.hearstapps.com/hmg-prod/images/cute-photos-of-cats-in-grass-1593184777.jpg"
img = processor.image_processor.fetch_images([url])[0]

# Either use img or url
# Each turn must be its own list element. Python silently concatenates adjacent
# string literals, so a missing comma would fuse the example answer and the
# second "User:" turn into the first question.
prompts = [
    "User:",
    img,
    "Describe this image.",
    "Assistant: An image of two kittens in grass.",  # One-shot example
    "User:",
    "https://hips.hearstapps.com/hmg-prod/images/dog-puns-1581708208.jpg",
    "Describe this image.",
    "Assistant: ",
]

# `inputs` is reused by the following cell, so the name must stay as is.
inputs = processor(prompts, return_tensors="pt", debug=True).to(device)

generated_ids = model.generate(**inputs, eos_token_id=exit_condition, bad_words_ids=bad_words_ids, max_length=100)
# Special tokens are dropped here so only the readable conversation is shown.
generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
generated_text

# Show the special tokens injected around the images

In [None]:
# Generate again from the `inputs` built in the previous cell, this time
# decoding with the special tokens left in.
generated_ids = model.generate(
    **inputs,
    eos_token_id=exit_condition,
    bad_words_ids=bad_words_ids,
    max_length=100,
)
decoded = processor.batch_decode(generated_ids, skip_special_tokens=False)
generated_text = decoded[0]
generated_text

# Ask Questions About Text in the Image

![](img/happy-car-chris.png)

In [None]:
from PIL import Image
# Open the local image file for this example.
img = Image.open("img/happy-car-chris.png")

# One user turn (image + instruction) followed by an open assistant turn.
prompts = ["User: ", img, "Describe this image.", "Assistant: "]

inputs = processor(prompts, return_tensors="pt", debug=True)
inputs = inputs.to(device)

generated_ids = model.generate(
    **inputs,
    eos_token_id=exit_condition,
    bad_words_ids=bad_words_ids,
    max_length=100,
)

# Decode with special tokens kept and display the first sequence.
decoded = processor.batch_decode(generated_ids, skip_special_tokens=False)
generated_text = decoded[0]
generated_text

In [None]:
from PIL import Image
# Same car image as the previous cell, now with a question instead of a description request.
img = Image.open("img/happy-car-chris.png")

question = "Who makes this car?"
prompts = ["User: ", img, question, "Assistant: "]

inputs = processor(prompts, return_tensors="pt", debug=True)
inputs = inputs.to(device)

generated_ids = model.generate(
    **inputs,
    eos_token_id=exit_condition,
    bad_words_ids=bad_words_ids,
    max_length=100,
)

# Special tokens are kept in the decoded output.
decoded = processor.batch_decode(generated_ids, skip_special_tokens=False)
generated_text = decoded[0]
generated_text

![](img/baby-groot-toy.jpg)

In [None]:
# Open the local image file for this example.
img = Image.open("img/baby-groot-toy.jpg")

prompts = [
    "User: ",
    img,
    "Which movie is this character from?",
    "Assistant: ",
]

inputs = processor(prompts, return_tensors="pt").to(device)

# Pass the same stop condition and blocked image tokens as the other generation
# cells, so generation ends at <end_of_utterance> and cannot emit image
# placeholder tokens.
generated_ids = model.generate(
    **inputs,
    eos_token_id=exit_condition,
    bad_words_ids=bad_words_ids,
    max_length=100,
)
generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
print(generated_text)

# Chain of thought

![](img/baby-groot-toy.jpg)

In [None]:
# This image is from https://www.amazon.com/Hot-Toys-Marvel-Guardians-Life-Size/dp/B07257N92P
img = Image.open("img/baby-groot-toy.jpg")

prompts = [
    "User: ",
    img,
#    "Who produced the movie that features this character?",
    "Who produced the movie that features this character? Think step-by-step.",
    "Assistant: ",
]

inputs = processor(prompts, return_tensors="pt").to(device)

# Pass the same stop condition and blocked image tokens as the other generation
# cells, so generation ends at <end_of_utterance> and cannot emit image
# placeholder tokens.
generated_ids = model.generate(
    **inputs,
    eos_token_id=exit_condition,
    bad_words_ids=bad_words_ids,
    max_length=100,
)
generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
print(generated_text)

![](img/margherita-pizza.jpg)

In [None]:
# This image is from https://eu.ooni.com/blogs/recipes/margherita-pizza

img = Image.open("img/margherita-pizza.jpg")

# Step-by-step prompt about the pictured dish.
prompts = ["User: ", img, "How do I make this? Think step by step.", "Assistant: "]

inputs = processor(prompts, return_tensors="pt")
inputs = inputs.to(device)

# This cell uses max_length=1000 rather than the 100 used in the earlier cells.
generated_ids = model.generate(
    **inputs,
    max_length=1000,
    eos_token_id=exit_condition,
    bad_words_ids=bad_words_ids,
)
decoded = processor.batch_decode(generated_ids, skip_special_tokens=True)
generated_text = decoded[0]
print(generated_text)

![](img/nflx-5-year-stock-chart.png)

In [None]:
# Open the local chart image for this example.
img = Image.open("img/nflx-5-year-stock-chart.png")

instruction = "Describe this image. Think step by step."
prompts = ["User: ", img, instruction, "Assistant: "]

inputs = processor(prompts, return_tensors="pt")
inputs = inputs.to(device)

generated_ids = model.generate(
    **inputs,
    max_length=1000,
    eos_token_id=exit_condition,
    bad_words_ids=bad_words_ids,
)

# Drop special tokens and print the first decoded sequence.
decoded = processor.batch_decode(generated_ids, skip_special_tokens=True)
generated_text = decoded[0]
print(generated_text)

# The model is not yet good at reading charts

In [None]:
# Open the local chart image for this example.
img = Image.open("img/nflx-5-year-stock-chart.png")

# The question text is sent to the model verbatim, so it is spelled correctly
# ("maximum") to avoid feeding the model a typo.
prompts = [
    "User: ",
    img,
    "What is the maximum stock price as shown in this chart. Think step by step.",
    "Assistant: ",
]

inputs = processor(prompts, return_tensors="pt").to(device)

generated_ids = model.generate(**inputs, max_length=1000, eos_token_id=exit_condition, bad_words_ids=bad_words_ids)
# Drop special tokens and print the first decoded sequence.
generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
print(generated_text)

# The model is not yet good at reading charts

In [None]:
# Same chart image as the previous cells, with a different question.
img = Image.open("img/nflx-5-year-stock-chart.png")

question = "What is the current stock price as shown in this chart. Think step by step."
prompts = ["User: ", img, question, "Assistant: "]

inputs = processor(prompts, return_tensors="pt")
inputs = inputs.to(device)

generated_ids = model.generate(
    **inputs,
    max_length=1000,
    eos_token_id=exit_condition,
    bad_words_ids=bad_words_ids,
)

# Drop special tokens and print the first decoded sequence.
decoded = processor.batch_decode(generated_ids, skip_special_tokens=True)
generated_text = decoded[0]
print(generated_text)