https://huggingface.co/docs/transformers/main/model_doc/perceiver

In [9]:
import torch
from transformers import PerceiverConfig, PerceiverTokenizer, PerceiverImageProcessor, PerceiverModel
from transformers.models.perceiver.modeling_perceiver import (
    PerceiverTextPreprocessor,
    PerceiverImagePreprocessor,
    PerceiverClassificationDecoder,
)
from PIL import Image
import requests

In [2]:
# Pick the first CUDA GPU when one is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
device  # bare expression: the notebook displays the selected device

device(type='cuda', index=0)

In [3]:
# EXAMPLE 1: using the Perceiver to classify texts
# - we define a TextPreprocessor, which can be used to embed tokens
# - we define a ClassificationDecoder, which can be used to decode the
#   final hidden states of the latents to classification logits,
#   using trainable position embeddings

In [4]:
# Hugging Face Hub checkpoint identifier.
# NOTE(review): no later cell visible here references `version`; the config,
# tokenizer and model below are all constructed from defaults rather than
# loaded from this checkpoint — confirm whether `from_pretrained(version)`
# was intended.
version = "deepmind/language-perceiver"

# PerceiverConfig

In [11]:
# Build a configuration with no arguments, i.e. the library defaults
# (the repr printed below lists the resulting values).
config = PerceiverConfig()
config  # bare expression: the notebook displays the repr of the last value

PerceiverConfig {
  "attention_probs_dropout_prob": 0.1,
  "audio_samples_per_frame": 1920,
  "cross_attention_shape_for_attention": "kv",
  "cross_attention_widening_factor": 1,
  "d_latents": 1280,
  "d_model": 768,
  "hidden_act": "gelu",
  "image_size": 56,
  "initializer_range": 0.02,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 2048,
  "model_type": "perceiver",
  "num_blocks": 1,
  "num_cross_attention_heads": 8,
  "num_frames": 16,
  "num_latents": 256,
  "num_self_attends_per_block": 26,
  "num_self_attention_heads": 8,
  "output_shape": [
    1,
    16,
    224,
    224
  ],
  "qk_channels": null,
  "samples_per_patch": 16,
  "self_attention_widening_factor": 1,
  "train_size": [
    368,
    496
  ],
  "transformers_version": "4.31.0",
  "use_query_residual": true,
  "v_channels": null,
  "vocab_size": 262
}

# PerceiverTextPreprocessor

In [18]:
# Text preprocessor built from the config. Its repr below shows a token
# embedding table and a position embedding table.
preprocessor: PerceiverTextPreprocessor = PerceiverTextPreprocessor(config)
preprocessor  # display the module structure

PerceiverTextPreprocessor(
  (embeddings): Embedding(262, 768)
  (position_embeddings): Embedding(2048, 768)
)

# PerceiverClassificationDecoder

In [16]:
# Classification decoder whose channel count matches the latent width
# (config.d_latents). The trainable position encoding is configured with
# index_dims=1 and the same channel count as the latents.
decoder = PerceiverClassificationDecoder(
    config,
    num_channels=config.d_latents,
    trainable_position_encoding_kwargs={
        "num_channels": config.d_latents,
        "index_dims": 1,
    },
    use_query_residual=True,
)
decoder  # display the module structure

PerceiverClassificationDecoder(
  (decoder): PerceiverBasicDecoder(
    (output_position_encodings): PerceiverTrainablePositionEncoding()
    (positions_projection): Identity()
    (decoding_cross_attention): PerceiverLayer(
      (attention): PerceiverAttention(
        (self): PerceiverSelfAttention(
          (layernorm1): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
          (layernorm2): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
          (query): Linear(in_features=1280, out_features=1280, bias=True)
          (key): Linear(in_features=1280, out_features=1280, bias=True)
          (value): Linear(in_features=1280, out_features=1280, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (output): PerceiverSelfOutput(
          (dense): Linear(in_features=1280, out_features=1280, bias=True)
        )
      )
      (layernorm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
      (mlp): PerceiverMLP(
        (dense1): Linear(

# PerceiverTokenizer

In [20]:
# Default-constructed tokenizer (no checkpoint name passed); the repr below
# reports vocab_size=262 and model_max_length=2048.
tokenizer = PerceiverTokenizer()
tokenizer

PerceiverTokenizer(name_or_path='', vocab_size=262, model_max_length=2048, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': AddedToken("[BOS]", rstrip=False, lstrip=False, single_word=False, normalized=True), 'eos_token': AddedToken("[EOS]", rstrip=False, lstrip=False, single_word=False, normalized=True), 'sep_token': AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=True), 'pad_token': AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=True), 'cls_token': AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=True), 'mask_token': AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=True)}, clean_up_tokenization_spaces=True)

In [22]:
# Sample sentence that the following cells tokenize and feed to the model.
text = "hello world"

In [23]:
# Tokenize to a PyTorch tensor and keep only the token ids.
# In the output below, the 11 characters of "hello world" yield 13 ids: each
# character maps to its byte value + 6 (e.g. "h" = 104 -> 110), wrapped by
# ids 4 and 5 (presumably special start/end tokens — confirm against the
# tokenizer's vocabulary).
inputs = tokenizer(text, return_tensors="pt").input_ids
inputs

tensor([[  4, 110, 107, 114, 114, 117,  38, 125, 117, 120, 114, 106,   5]])

# PerceiverModel

In [24]:
# Assemble the full model from the config plus the text preprocessor and
# classification decoder created in the earlier cells.
# NOTE(review): the model is constructed directly from `config`; no pretrained
# weights are loaded in the cells visible here, so its outputs come from
# freshly initialized parameters — confirm this is intended.
model = PerceiverModel(config, input_preprocessor=preprocessor, decoder=decoder)
model  # display the module structure

PerceiverModel(
  (input_preprocessor): PerceiverTextPreprocessor(
    (embeddings): Embedding(262, 768)
    (position_embeddings): Embedding(2048, 768)
  )
  (embeddings): PerceiverEmbeddings()
  (encoder): PerceiverEncoder(
    (cross_attention): PerceiverLayer(
      (attention): PerceiverAttention(
        (self): PerceiverSelfAttention(
          (layernorm1): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
          (layernorm2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (query): Linear(in_features=1280, out_features=768, bias=True)
          (key): Linear(in_features=768, out_features=768, bias=True)
          (value): Linear(in_features=768, out_features=768, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (output): PerceiverSelfOutput(
          (dense): Linear(in_features=768, out_features=1280, bias=True)
        )
      )
      (layernorm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
      (mlp): Percei

In [25]:
# Switch to evaluation mode first: the model was constructed directly (not via
# `from_pretrained`), so it is still in training mode and its Dropout(p=0.1)
# layers would randomly perturb the result from run to run.
# `torch.inference_mode()` only disables autograd tracking; it does not turn
# dropout off.
model.eval()
with torch.inference_mode():
    outputs = model(inputs=inputs)
outputs  # display the full model output (logits and last hidden state)

PerceiverModelOutput(logits=tensor([[ 0.0690, -0.3086]]), last_hidden_state=tensor([[[-1.6973,  1.9563, -3.6782,  ..., -0.4137, -2.4918, -1.9118],
         [-1.8106,  1.9883, -3.6225,  ..., -0.7294, -2.3145, -1.7949],
         [-1.7977,  2.1399, -3.8856,  ..., -0.8173, -2.7854, -2.2094],
         ...,
         [-1.5733,  2.4623, -4.0288,  ..., -1.0537, -2.5059, -1.5030],
         [-2.0358,  2.3189, -3.3642,  ..., -0.8099, -2.8462, -1.9373],
         [-1.6094,  1.9983, -3.8460,  ..., -1.2038, -2.5860, -1.8186]]]), hidden_states=None, attentions=None, cross_attentions=None)

In [26]:
# Classification logits produced by the decoder; the output below is a
# 1 x 2 tensor (one row for the single input sentence).
outputs.logits

tensor([[ 0.0690, -0.3086]])