In [1]:
import transformers

In [2]:
print("Transformers version:", transformers.__version__)

Transformers version: 4.49.0


## Text-Image to Text generation Pipeline

In [3]:
from transformers import pipeline

pipe = pipeline(
    task="image-text-to-text", model="Salesforce/blip-image-captioning-base"
)
pipe(
    "https://huggingface.co/datasets/Narsil/image_dummy/raw/main/parrots.png",
    text="A photo of",
)

2025-04-05 15:37:34.512722: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1743860254.590014    5845 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1743860254.612333    5845 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-04-05 15:37:34.806056: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


config.json:   0%|          | 0.00/4.56k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/990M [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/287 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/506 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

Device set to use cpu
Keyword argument `legacy` is not a valid argument for this processor and will be ignored.


[{'input_text': 'A photo of', 'generated_text': 'A photo of two birds'}]

In [9]:
pipe(
    "https://huggingface.co/datasets/Narsil/image_dummy/raw/main/parrots.png",
    text="The name of the animal in the image is",
)

[{'input_text': 'The name of the animal in the image is',
  'generated_text': 'The name of the animal in the image is black and white'}]

In [10]:
pipe = pipeline("image-text-to-text", model="llava-hf/llava-interleave-qwen-0.5b-hf")
messages = [
    {
        "role": "user",
        "content": [
            {
                "type": "image",
                "url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg",
            },
            {"type": "text", "text": "Give the animal type in the image"},
        ],
    },
    {
        "role": "assistant",
        "content": [
            {"type": "text", "text": "It is a"},
        ],
    },
]
pipe(text=messages, max_new_tokens=20, return_full_text=False)

config.json:   0%|          | 0.00/1.25k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.73G [00:00<?, ?B/s]

Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.


generation_config.json:   0%|          | 0.00/126 [00:00<?, ?B/s]

processor_config.json:   0%|          | 0.00/169 [00:00<?, ?B/s]

chat_template.json:   0%|          | 0.00/699 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/393 [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.48, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


tokenizer_config.json:   0%|          | 0.00/1.60k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/101 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/367 [00:00<?, ?B/s]

Device set to use cpu
Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


[{'input_text': [{'role': 'user',
    'content': [{'type': 'image',
      'url': 'https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg'},
     {'type': 'text', 'text': 'Give the animal type in the image'}]},
   {'role': 'assistant', 'content': [{'type': 'text', 'text': 'It is a'}]}],
  'generated_text': ' dog.'}]

## Image Classification
```python

In [3]:
from transformers import pipeline

classifier = pipeline(model="microsoft/beit-base-patch16-224-pt22k-ft22k")
classifier("https://huggingface.co/datasets/Narsil/image_dummy/raw/main/parrots.png")

Device set to use cpu


KeyError: 9205

In [2]:
from transformers import AutoModelForImageClassification
from PIL import Image
from timm.data.transforms_factory import create_transform
import requests

model = AutoModelForImageClassification.from_pretrained(
    "nvidia/MambaVision-S-1K", trust_remote_code=True
)

# eval mode for inference
model.cuda().eval()

# prepare image for the model
url = "http://images.cocodataset.org/val2017/000000020247.jpg"
image = Image.open(requests.get(url, stream=True).raw)
input_resolution = (3, 224, 224)  # MambaVision supports any input resolutions

transform = create_transform(
    input_size=input_resolution,
    is_training=False,
    mean=model.config.mean,
    std=model.config.std,
    crop_mode=model.config.crop_mode,
    crop_pct=model.config.crop_pct,
)

inputs = transform(image).unsqueeze(0).cuda()
# model inference
outputs = model(inputs)
logits = outputs["logits"]
predicted_class_idx = logits.argmax(-1).item()
print("Predicted class:", model.config.id2label[predicted_class_idx])


Predicted class: brown bear, bruin, Ursus arctos


## Zero-Shot Image Classification

In [2]:
from transformers import pipeline

classifier = pipeline(model="google/siglip-so400m-patch14-384")
print(
    classifier(
        "https://huggingface.co/datasets/Narsil/image_dummy/raw/main/parrots.png",
        candidate_labels=["animals", "humans", "landscape"],
    )
)

classifier(
    "https://huggingface.co/datasets/Narsil/image_dummy/raw/main/parrots.png",
    candidate_labels=["black and white", "photorealist", "painting"],
)

Device set to use cpu


[{'score': 0.006197041366249323, 'label': 'animals'}, {'score': 2.4825822038110346e-05, 'label': 'humans'}, {'score': 1.657347456784919e-05, 'label': 'landscape'}]


[{'score': 0.2144334614276886, 'label': 'black and white'},
 {'score': 0.0009214494493789971, 'label': 'photorealist'},
 {'score': 6.276291060203221e-07, 'label': 'painting'}]

## Text-Classification Pipeline (sentiment analysis)

In [12]:
classifier = pipeline(
    model="distilbert/distilbert-base-uncased-finetuned-sst-2-english"
)
print(classifier("Ya bro this fuckinz good !"))

classifier("Director tried too much.")

Device set to use cpu


[{'label': 'POSITIVE', 'score': 0.9997861981391907}]


[{'label': 'NEGATIVE', 'score': 0.9963769316673279}]

## Token Classification Pipeline (NER)

In [4]:
from transformers import pipeline

token_classifier = pipeline(
    model="Jean-Baptiste/camembert-ner", aggregation_strategy="simple"
)
sentence = "Je m'appelle jean-baptiste et je vis à montréal"
tokens = token_classifier(sentence)
tokens

token = tokens[0]
# Start and end provide an easy way to highlight words in the original text.
sentence[token["start"] : token["end"]]

# Some models use the same idea to do part of speech.
syntaxer = pipeline(
    model="vblagoje/bert-english-uncased-finetuned-pos", aggregation_strategy="simple"
)
syntaxer("My name is Sarah and I live in London")

Device set to use cpu


config.json:   0%|          | 0.00/1.06k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

Some weights of the model checkpoint at vblagoje/bert-english-uncased-finetuned-pos were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Device set to use cpu


[{'entity_group': 'PRON',
  'score': np.float32(0.9994574),
  'word': 'my',
  'start': 0,
  'end': 2},
 {'entity_group': 'NOUN',
  'score': np.float32(0.99672616),
  'word': 'name',
  'start': 3,
  'end': 7},
 {'entity_group': 'AUX',
  'score': np.float32(0.9942233),
  'word': 'is',
  'start': 8,
  'end': 10},
 {'entity_group': 'PROPN',
  'score': np.float32(0.99876463),
  'word': 'sarah',
  'start': 11,
  'end': 16},
 {'entity_group': 'CCONJ',
  'score': np.float32(0.9991027),
  'word': 'and',
  'start': 17,
  'end': 20},
 {'entity_group': 'PRON',
  'score': np.float32(0.99949694),
  'word': 'i',
  'start': 21,
  'end': 22},
 {'entity_group': 'VERB',
  'score': np.float32(0.9981451),
  'word': 'live',
  'start': 23,
  'end': 27},
 {'entity_group': 'ADP',
  'score': np.float32(0.9994154),
  'word': 'in',
  'start': 28,
  'end': 30},
 {'entity_group': 'PROPN',
  'score': np.float32(0.9986274),
  'word': 'london',
  'start': 31,
  'end': 37}]

## Table Question Answering Pipeline

In [2]:
from transformers import pipeline

oracle = pipeline(model="google/tapas-base-finetuned-wtq")
table = {
    "Repository": ["Transformers", "Datasets", "Tokenizers"],
    "Stars": ["36542", "4512", "3934"],
    "Contributors": ["651", "77", "34"],
    "Programming language": ["Python", "Python", "Rust, Python and NodeJS"],
}
oracle(query="How many stars does the transformers repository have?", table=table)

config.json:   0%|          | 0.00/1.66k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/443M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/490 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/262k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/443M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/154 [00:00<?, ?B/s]

Device set to use cpu
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]


{'answer': 'AVERAGE > 36542',
 'coordinates': [(0, 1)],
 'cells': ['36542'],
 'aggregator': 'AVERAGE'}

## Image Segmentation Pipeline

In [2]:
from transformers import pipeline

segmenter = pipeline(model="facebook/detr-resnet-50-panoptic")
segments = segmenter(
    "https://huggingface.co/datasets/Narsil/image_dummy/raw/main/parrots.png"
)
len(segments)

print(segments[0]["label"])

print(segments[1]["label"])

type(
    segments[0]["mask"]
)  # This is a black and white mask showing where is the bird on the original image.

segments[0]["mask"].size

Some weights of the model checkpoint at facebook/detr-resnet-50-panoptic were not used when initializing DetrForSegmentation: ['detr.model.backbone.conv_encoder.model.layer1.0.downsample.1.num_batches_tracked', 'detr.model.backbone.conv_encoder.model.layer2.0.downsample.1.num_batches_tracked', 'detr.model.backbone.conv_encoder.model.layer3.0.downsample.1.num_batches_tracked', 'detr.model.backbone.conv_encoder.model.layer4.0.downsample.1.num_batches_tracked']
- This IS expected if you are initializing DetrForSegmentation from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DetrForSegmentation from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cpu


bird
bird


(768, 512)