https://huggingface.co/docs/transformers/main/model_doc/perceiver

In [1]:
import torch
from transformers import PerceiverConfig, PerceiverTokenizer, PerceiverImageProcessor, PerceiverModel
from transformers.models.perceiver.modeling_perceiver import (
    PerceiverTextPreprocessor,
    PerceiverImagePreprocessor,
    PerceiverClassificationDecoder,
)
from PIL import Image
import requests

In [2]:
# Fix the global RNG seed before any module is constructed. The preprocessor,
# decoder and model in the cells below are built straight from a config (no
# checkpoint is loaded in the visible cells), so their weights are randomly
# initialised and the displayed logits would otherwise change on every
# Restart & Run All.
SEED = 0
torch.manual_seed(SEED)

# Run on the first CUDA GPU when one is visible; otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

device(type='cuda', index=0)

In [3]:
# EXAMPLE 1: using the Perceiver to classify text
# - we define a PerceiverTextPreprocessor, which embeds the input tokens
# - we define a PerceiverClassificationDecoder, which decodes the final
#   hidden states of the latents into classification logits, using
#   trainable position embeddings

In [4]:
# Presumably the Hugging Face Hub id of the pretrained language Perceiver.
# NOTE(review): no cell visible in this notebook reads `version`; the model
# below is assembled from a default PerceiverConfig instead. Confirm whether
# a from_pretrained(version) call was intended.
version = "deepmind/language-perceiver"

# PerceiverConfig

In [5]:
# Build a configuration from the library defaults (no arguments are passed).
# The bare expression on the last line displays the resulting values.
config = PerceiverConfig()
config

PerceiverConfig {
  "_label_trainable_num_channels": 1024,
  "attention_probs_dropout_prob": 0.1,
  "audio_samples_per_frame": 1920,
  "cross_attention_shape_for_attention": "kv",
  "cross_attention_widening_factor": 1,
  "d_latents": 1280,
  "d_model": 768,
  "hidden_act": "gelu",
  "image_size": 56,
  "initializer_range": 0.02,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 2048,
  "model_type": "perceiver",
  "num_blocks": 1,
  "num_cross_attention_heads": 8,
  "num_frames": 16,
  "num_latents": 256,
  "num_self_attends_per_block": 26,
  "num_self_attention_heads": 8,
  "output_num_channels": 512,
  "output_shape": [
    1,
    16,
    224,
    224
  ],
  "qk_channels": null,
  "samples_per_patch": 16,
  "self_attention_widening_factor": 1,
  "train_size": [
    368,
    496
  ],
  "transformers_version": "4.39.0",
  "use_query_residual": true,
  "v_channels": null,
  "vocab_size": 262
}

# PerceiverTextPreprocessor

In [8]:
# Text front-end (token embeddings plus position embeddings), built from the
# config and moved to the target device in half precision. `Module.to`
# returns the module itself, so the chained call assigns the same object
# that the last line displays.
preprocessor = PerceiverTextPreprocessor(config).to(device, torch.float16)
preprocessor

PerceiverTextPreprocessor(
  (embeddings): Embedding(262, 768)
  (position_embeddings): Embedding(2048, 768)
)

# PerceiverClassificationDecoder

In [9]:
# Settings for the decoder's trainable position encoding: same channel width
# as the latents, with a single index dimension.
_position_encoding_kwargs = {"num_channels": config.d_latents, "index_dims": 1}

# Classification head that decodes the final latent states into logits.
decoder = PerceiverClassificationDecoder(
    config,
    num_channels=config.d_latents,
    trainable_position_encoding_kwargs=_position_encoding_kwargs,
    use_query_residual=True,
)

# Move to the target device in half precision, then display the module.
decoder = decoder.to(device, torch.float16)
decoder

PerceiverClassificationDecoder(
  (decoder): PerceiverBasicDecoder(
    (output_position_encodings): PerceiverTrainablePositionEncoding()
    (positions_projection): Identity()
    (decoding_cross_attention): PerceiverLayer(
      (attention): PerceiverAttention(
        (self): PerceiverSelfAttention(
          (layernorm1): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
          (layernorm2): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
          (query): Linear(in_features=1280, out_features=1280, bias=True)
          (key): Linear(in_features=1280, out_features=1280, bias=True)
          (value): Linear(in_features=1280, out_features=1280, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (output): PerceiverSelfOutput(
          (dense): Linear(in_features=1280, out_features=1280, bias=True)
        )
      )
      (layernorm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
      (mlp): PerceiverMLP(
        (dense1): Linear(

# PerceiverTokenizer

In [10]:
# Tokenizer constructed with no arguments (no vocabulary file or checkpoint
# is passed). The trailing expression displays its settings and special tokens.
tokenizer = PerceiverTokenizer()
tokenizer

PerceiverTokenizer(name_or_path='', vocab_size=256, model_max_length=2048, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': '[BOS]', 'eos_token': '[EOS]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	1: AddedToken("[BOS]", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	2: AddedToken("[EOS]", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	3: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	4: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	5: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
}

In [11]:
# Sample sentence that later cells tokenize and feed through the model.
text = "hello world"

In [18]:
# Tokenize the sample text into PyTorch tensors, then keep only the token ids
# and place them on the same device as the model. In the recorded output the
# sequence starts with id 4 ([CLS]) and ends with id 5 ([SEP]).
encoding = tokenizer(text, return_tensors="pt")
inputs = encoding["input_ids"].to(device)
inputs

tensor([[  4, 110, 107, 114, 114, 117,  38, 125, 117, 120, 114, 106,   5]],
       device='cuda:0')

# PerceiverModel

In [16]:
# Assemble the full model from the config, wiring in the text preprocessor as
# the input stage and the classification decoder as the output stage.
model = PerceiverModel(
    config,
    input_preprocessor=preprocessor,
    decoder=decoder,
)

# Cast to half precision on the target device, then display the architecture.
model = model.to(device, torch.float16)
model

PerceiverModel(
  (input_preprocessor): PerceiverTextPreprocessor(
    (embeddings): Embedding(262, 768)
    (position_embeddings): Embedding(2048, 768)
  )
  (embeddings): PerceiverEmbeddings()
  (encoder): PerceiverEncoder(
    (cross_attention): PerceiverLayer(
      (attention): PerceiverAttention(
        (self): PerceiverSelfAttention(
          (layernorm1): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
          (layernorm2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (query): Linear(in_features=1280, out_features=768, bias=True)
          (key): Linear(in_features=768, out_features=768, bias=True)
          (value): Linear(in_features=768, out_features=768, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (output): PerceiverSelfOutput(
          (dense): Linear(in_features=768, out_features=1280, bias=True)
        )
      )
      (layernorm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
      (mlp): Percei

In [19]:
# The model was constructed directly (not via from_pretrained), so it is still
# in training mode. `torch.inference_mode()` only turns off autograd tracking;
# it does not disable the Dropout(p=0.1) layers shown in the model repr.
# Switch to eval mode so the forward pass is deterministic for fixed weights.
model.eval()

# Forward pass without gradient bookkeeping; the last line displays the
# full output object (logits plus last hidden state).
with torch.inference_mode():
    outputs = model(inputs=inputs)
outputs

PerceiverModelOutput(logits=tensor([[-0.5283, -0.2347]], device='cuda:0', dtype=torch.float16), last_hidden_state=tensor([[[-0.0134,  4.4609, -2.7871,  ...,  2.2480, -1.4268,  0.6777],
         [ 0.7168,  4.2617, -2.7109,  ...,  2.6523, -1.6992,  0.2996],
         [-0.0256,  4.5508, -2.9453,  ...,  2.2305, -0.8931,  0.3347],
         ...,
         [-0.4036,  4.7227, -2.8164,  ...,  2.2754, -1.0645,  0.6475],
         [-0.2010,  4.6328, -3.2773,  ...,  2.2422, -1.4541,  0.3049],
         [-0.3857,  4.3711, -3.1055,  ...,  2.5254, -0.9180,  0.7041]]],
       device='cuda:0', dtype=torch.float16), hidden_states=None, attentions=None, cross_attentions=None)

In [20]:
# Show only the classification logits from the forward-pass result
# (the recorded output is a single row with two scores).
outputs.logits

tensor([[-0.5283, -0.2347]], device='cuda:0', dtype=torch.float16)