In [50]:
import torch
import config as cfg
from model.CPTR_upd import CPTR
from tokenizer.tokenizer import TokenizerHF, ByteLevelBPE
from torchviz import make_dot

In [51]:
# Location of the saved run configuration, relative to this notebook.
CONFIG_JSON_PATH = "../results/config_20260121-200819/config_20260121-200819/config.json"

config = cfg.import_config(CONFIG_JSON_PATH)

# Notebook-local overrides applied on top of the loaded configuration.
config["TOKENIZER_DATA_PATH"] = "tokenizer_data"
config["TOKENIZER_TYPE"] = cfg.TokenizerType.HF


In [52]:
# PAD / BOS / EOS markers from the project config module.
# NOTE(review): no visible cell reads `special_tokens`; kept in case a cell
# outside this view depends on it.
special_tokens = [cfg.SpecialTokens.PAD, cfg.SpecialTokens.BOS, cfg.SpecialTokens.EOS]

# Tokenizer built with its default arguments.
tokenizer = TokenizerHF()

# Padding id and vocabulary size are both passed to the model constructor.
pad_idx, vocab_size = tokenizer.get_padding_token_id(), tokenizer.get_vocab_size()


Initializing HF Tokenizer with special tokens: {'eos_token': '<eos>', 'bos_token': '<bos>', 'pad_token': '<pad>'}
HF Tokenizer initialized with vocab size: 50260, pad_token_id: 50259


In [53]:
# Maps each CPTR constructor argument to the config key that supplies it.
cptr_config_keys = {
    "num_patches": "NUM_PATCHES",
    "encoder_arch": "ENCODER_ARCH",
    "encoding_strategy": "VIT_ENCODING_STRATEGY",
    "use_embedding_projection": "USE_PROJECTION_LAYER",
    "img_emb_use_conv": "USE_CONV_IMG_EMBEDDING",
    "img_emb_dim": "IMG_EMBEDDING_DIM",
    "patch_size": "PATCH_SIZE",
    "text_emb_dim": "TEXT_EMBEDDING_DIM",
    "d_model": "EMBEDDING_DIM",
    "max_text_seq_len": "MAX_TEXT_SEQUENCE_LENGTH",
    "channels": "NUM_INPUT_CHANNELS",
    "num_encoder_blocks": "ENCODER_NUM_BLOCKS",
    "num_encoder_heads": "ENCODER_NUM_HEADS",
    "encoder_hidden_dim": "ENCODER_HIDDEN_DIM",
    "encoder_dropout_prob": "ENCODER_DROPOUT_PROB",
    "num_decoder_blocks": "DECODER_NUM_BLOCKS",
    "num_decoder_heads": "DECODER_NUM_HEADS",
    "decoder_hidden_dim": "DECODER_HIDDEN_DIM",
    "decoder_dropout_prob": "DECODER_DROPOUT_PROB",
    "bias": "USE_BIAS",
    "use_weight_tying": "USE_WEIGHT_TYING",
    "sublayer_dropout": "SUBLAYER_DROPOUT",
}

# Resolve every config-backed argument up front.
cptr_kwargs = {arg_name: config[cfg_key] for arg_name, cfg_key in cptr_config_keys.items()}

# Vocabulary size and padding id come from the tokenizer, not the config.
model = CPTR(
    **cptr_kwargs,
    vocab_size=vocab_size,
    pad_idx=pad_idx,
    verbose=False,
)

# Switch to evaluation mode; as the cell's last expression the returned
# module is displayed, which prints the full layer listing.
model.eval()

Initialized CNN ResNet-50 Encoder


CPTR(
  (encoder): CNNEncoder(
    (backbone): Sequential(
      (0): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
      (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU(inplace=True)
      (3): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
      (4): Sequential(
        (0): Bottleneck(
          (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
          (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (relu): ReLU(inplace=True)
         

In [54]:
# Random image batch of size 1 at the configured resolution.
# NOTE(review): the channel count is hard-coded to 3 here while the model is
# built with config["NUM_INPUT_CHANNELS"] -- confirm the two agree.
dummy_img = torch.randn(1, 3, config["IMG_HEIGHT"], config["IMG_WIDTH"])

# Random token ids in [0, vocab_size): one sequence of at most 10 tokens,
# capped at the configured maximum text length.
dummy_txt_len = min(10, config["MAX_TEXT_SEQUENCE_LENGTH"])
dummy_txt = torch.randint(low=0, high=vocab_size, size=(1, dummy_txt_len))

In [55]:
# Run one forward pass on the dummy batch so autograd records the graph
# that torchviz will draw.
output = model(dummy_img, dummy_txt)

# The forward pass may return a bare tensor, a tuple, or a dict; reduce the
# latter two to a single tensor.
if isinstance(output, tuple):
    # Take the first tensor in the tuple and fail with a clear message
    # (instead of a bare StopIteration) when there is none.
    output = next((o for o in output if torch.is_tensor(o)), None)
    if output is None:
        raise ValueError("model returned a tuple that contains no tensor")

elif isinstance(output, dict):
    # Prefer the "logits" entry, otherwise fall back to the first value.
    output = output.get("logits", list(output.values())[0])

# Reduce to a scalar; its autograd history is what gets visualised.
output = output.sum()

# `params` lets torchviz label graph nodes with the model's parameter names.
dot = make_dot(output, params=dict(model.named_parameters()))

# Render exactly once ("png" or "svg" are possible). The original cell set
# the format and rendered twice, which repeated the Graphviz work and its
# warnings without changing the result.
dot.format = "png"
dot.render("CPTR_architecture")

dot: graph is too large for cairo-renderer bitmaps. Scaling by 0.74802 to fit
dot: graph is too large for cairo-renderer bitmaps. Scaling by 0.74802 to fit


'CPTR_architecture.png'