In [2]:
import torch
from transformers import (
    Owlv2TextConfig,
    Owlv2TextModel,
    Owlv2VisionConfig,
    Owlv2VisionModel,
    Owlv2Config,
    Owlv2Model,
)

In [5]:
# Pick the compute device once, up front: the first CUDA GPU when PyTorch
# reports one as available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
device

device(type='cuda', index=0)

# Owlv2TextConfig

In [6]:
# Owlv2TextConfig is the configuration class that stores the configuration of an
# Owlv2TextModel. It is used to instantiate an OWLv2 text encoder according to the
# specified arguments, which define the model architecture. It is called here with
# no arguments, so every value shown in the cell output is a library default.
# Instantiating a configuration with the defaults is described as yielding a
# configuration similar to that of the google/owlv2-base-patch32 architecture.
# NOTE(review): the checkpoint name is carried over from the original comment --
# confirm it against the transformers docs for the installed version.
text_config = Owlv2TextConfig()
text_config

Owlv2TextConfig {
  "attention_dropout": 0.0,
  "bos_token_id": 49406,
  "eos_token_id": 49407,
  "hidden_act": "quick_gelu",
  "hidden_size": 512,
  "initializer_factor": 1.0,
  "initializer_range": 0.02,
  "intermediate_size": 2048,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 16,
  "model_type": "owlv2_text_model",
  "num_attention_heads": 8,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "transformers_version": "4.35.0",
  "vocab_size": 49408
}

In [7]:
# Build the text encoder directly from `text_config` (no `from_pretrained` call is
# involved in this cell), then move the module onto the selected device.
text_model: Owlv2TextModel = Owlv2TextModel(text_config)
text_model = text_model.to(device)
text_model

Owlv2TextModel(
  (text_model): Owlv2TextTransformer(
    (embeddings): Owlv2TextEmbeddings(
      (token_embedding): Embedding(49408, 512)
      (position_embedding): Embedding(16, 512)
    )
    (encoder): Owlv2Encoder(
      (layers): ModuleList(
        (0-11): 12 x Owlv2EncoderLayer(
          (self_attn): Owlv2Attention(
            (k_proj): Linear(in_features=512, out_features=512, bias=True)
            (v_proj): Linear(in_features=512, out_features=512, bias=True)
            (q_proj): Linear(in_features=512, out_features=512, bias=True)
            (out_proj): Linear(in_features=512, out_features=512, bias=True)
          )
          (layer_norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (mlp): Owlv2MLP(
            (activation_fn): QuickGELUActivation()
            (fc1): Linear(in_features=512, out_features=2048, bias=True)
            (fc2): Linear(in_features=2048, out_features=512, bias=True)
          )
          (layer_norm2): LayerNorm((512,),

In [8]:
# Display the configuration object attached to the instantiated text model, so it
# can be compared with the `text_config` repr it was built from.
text_model.config

Owlv2TextConfig {
  "attention_dropout": 0.0,
  "bos_token_id": 49406,
  "eos_token_id": 49407,
  "hidden_act": "quick_gelu",
  "hidden_size": 512,
  "initializer_factor": 1.0,
  "initializer_range": 0.02,
  "intermediate_size": 2048,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 16,
  "model_type": "owlv2_text_model",
  "num_attention_heads": 8,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "transformers_version": "4.35.0",
  "vocab_size": 49408
}

# Owlv2VisionConfig

In [9]:
# Owlv2VisionConfig is the configuration class that stores the configuration of an
# Owlv2VisionModel. It is used to instantiate an OWLv2 image encoder according to
# the specified arguments, which define the model architecture. It is called here
# with no arguments, so every value shown in the cell output is a library default.
# Instantiating a configuration with the defaults is described as yielding a
# configuration similar to that of the google/owlv2-base-patch32 architecture.
# NOTE(review): the checkpoint name is carried over from the original comment --
# confirm it against the transformers docs for the installed version.
vision_config = Owlv2VisionConfig()
vision_config

Owlv2VisionConfig {
  "attention_dropout": 0.0,
  "hidden_act": "quick_gelu",
  "hidden_size": 768,
  "image_size": 768,
  "initializer_factor": 1.0,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "model_type": "owlv2_vision_model",
  "num_attention_heads": 12,
  "num_channels": 3,
  "num_hidden_layers": 12,
  "patch_size": 32,
  "transformers_version": "4.35.0"
}

In [10]:
# Build the image encoder directly from `vision_config` (no `from_pretrained` call
# is involved in this cell), then move the module onto the selected device.
vision_model: Owlv2VisionModel = Owlv2VisionModel(vision_config)
vision_model = vision_model.to(device)
vision_model

Owlv2VisionModel(
  (vision_model): Owlv2VisionTransformer(
    (embeddings): Owlv2VisionEmbeddings(
      (patch_embedding): Conv2d(3, 768, kernel_size=(32, 32), stride=(32, 32), bias=False)
      (position_embedding): Embedding(577, 768)
    )
    (pre_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (encoder): Owlv2Encoder(
      (layers): ModuleList(
        (0-11): 12 x Owlv2EncoderLayer(
          (self_attn): Owlv2Attention(
            (k_proj): Linear(in_features=768, out_features=768, bias=True)
            (v_proj): Linear(in_features=768, out_features=768, bias=True)
            (q_proj): Linear(in_features=768, out_features=768, bias=True)
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
          (layer_norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): Owlv2MLP(
            (activation_fn): QuickGELUActivation()
            (fc1): Linear(in_features=768, out_features=3072, bias=True)

In [11]:
# Display the configuration object attached to the instantiated vision model, so
# it can be compared with the `vision_config` repr it was built from.
vision_model.config

Owlv2VisionConfig {
  "attention_dropout": 0.0,
  "hidden_act": "quick_gelu",
  "hidden_size": 768,
  "image_size": 768,
  "initializer_factor": 1.0,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "model_type": "owlv2_vision_model",
  "num_attention_heads": 12,
  "num_channels": 3,
  "num_hidden_layers": 12,
  "patch_size": 32,
  "transformers_version": "4.35.0"
}

# Owlv2Config

In [13]:
# Owlv2Config is the configuration class that stores the configuration of an
# Owlv2Model. It is used to instantiate an OWLv2 model according to the specified
# arguments, which define the text model and vision model configs. It is called
# here with no arguments; in the cell output the nested `text_config` and
# `vision_config` entries list only their `model_type`.
# Instantiating a configuration with the defaults is described as yielding a
# configuration similar to that of the OWLv2 google/owlv2-base-patch32 architecture.
# NOTE(review): the checkpoint name is carried over from the original comment --
# confirm it against the transformers docs for the installed version.
config = Owlv2Config()
config

Owlv2Config {
  "initializer_factor": 1.0,
  "logit_scale_init_value": 2.6592,
  "model_type": "owlv2",
  "projection_dim": 512,
  "text_config": {
    "model_type": "owlv2_text_model"
  },
  "transformers_version": "4.35.0",
  "vision_config": {
    "model_type": "owlv2_vision_model"
  }
}

In [14]:
# Build the combined text + vision model directly from `config` (no
# `from_pretrained` call is involved in this cell), then move it onto the
# selected device.
model: Owlv2Model = Owlv2Model(config)
model = model.to(device)
model

Owlv2Model(
  (text_model): Owlv2TextTransformer(
    (embeddings): Owlv2TextEmbeddings(
      (token_embedding): Embedding(49408, 512)
      (position_embedding): Embedding(16, 512)
    )
    (encoder): Owlv2Encoder(
      (layers): ModuleList(
        (0-11): 12 x Owlv2EncoderLayer(
          (self_attn): Owlv2Attention(
            (k_proj): Linear(in_features=512, out_features=512, bias=True)
            (v_proj): Linear(in_features=512, out_features=512, bias=True)
            (q_proj): Linear(in_features=512, out_features=512, bias=True)
            (out_proj): Linear(in_features=512, out_features=512, bias=True)
          )
          (layer_norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (mlp): Owlv2MLP(
            (activation_fn): QuickGELUActivation()
            (fc1): Linear(in_features=512, out_features=2048, bias=True)
            (fc2): Linear(in_features=2048, out_features=512, bias=True)
          )
          (layer_norm2): LayerNorm((512,), eps

In [15]:
# Display the configuration object attached to the instantiated combined model, so
# it can be compared with the `config` repr it was built from.
model.config

Owlv2Config {
  "initializer_factor": 1.0,
  "logit_scale_init_value": 2.6592,
  "model_type": "owlv2",
  "projection_dim": 512,
  "text_config": {
    "model_type": "owlv2_text_model"
  },
  "transformers_version": "4.35.0",
  "vision_config": {
    "model_type": "owlv2_vision_model"
  }
}