In [42]:
import torch
import torchvista

import importlib

import sys
sys.path.append("..")
import config as cfg

from model.CPTR_upd import CPTR
from tokenizer.tokenizer import TokenizerHF, ByteLevelBPE

from dataset.loader import DatasetLoader

In [43]:
# Re-execute the already-imported config module so that edits made to it since
# the first import take effect in this kernel without a restart.
importlib.reload(cfg)

<module 'config' from '/home/nad/studies/Transformer-Image-Captioning-IIW/visualization/../config.py'>

In [44]:
# All tensors created in this notebook are placed on this device.
device = "cpu"

# Experiment directory; config.json and cptr_model.pth are looked up inside it.
model_folder = cfg.CONFIG_ROOT / "experiments/config_20260207-182348"
model_path = model_folder / "cptr_model.pth"
config = cfg.import_config(model_folder / "config.json")

In [45]:
# Batch sizes used when building the dataset loaders.
batch_size_train, batch_size_test = (
    config["BATCH_SIZE_TRAIN"],
    config["BATCH_SIZE_TEST"],
)

# Image-side settings: height, width, patch size and image embedding dimension.
H, W, P, D_IMG = (
    config[key]
    for key in ("IMG_HEIGHT", "IMG_WIDTH", "PATCH_SIZE", "IMG_EMBEDDING_DIM")
)

# Text-side settings. L is the length the data is truncated/padded to
# AFTER tokenization.
L = config["MAX_TEXT_SEQUENCE_LENGTH"]
D_TEXT = config["TEXT_EMBEDDING_DIM"]
DROPOUT_DEC, RANDOM_SEED = config["DECODER_DROPOUT_PROB"], config["RANDOM_SEED"]

In [46]:
# Special tokens handed to the byte-level BPE tokenizer when it is selected.
special_tokens = [cfg.SpecialTokens.PAD, cfg.SpecialTokens.BOS, cfg.SpecialTokens.EOS]

# Instantiate the tokenizer implementation named in the experiment config.
if config["TOKENIZER_TYPE"] != cfg.TokenizerType.HF:
    # The BPE tokenizer is loaded from files identified by the configured prefix.
    tokenizer = ByteLevelBPE(special_tokens=special_tokens)
    tokenizer.load(
        folder=cfg.TOKENIZER_DATA_PATH,
        filename_prefix=config["TOKENIZER_FILENAME_PREFIX"],
    )
else:
    tokenizer = TokenizerHF()

# Values the model constructor needs from the tokenizer.
vocab_size = tokenizer.get_vocab_size()
pad_idx = tokenizer.get_padding_token_id()


In [47]:
# Dataset wrapper configured from the experiment settings.
loader_kwargs = dict(
    dataset_type=config["DATASET"],
    img_height=H,
    img_width=W,
    batch_size_train=batch_size_train,
    batch_size_test=batch_size_test,
    split_ratio=config["SPLIT_RATIO"],
    shuffle_test=True,
    seed=RANDOM_SEED,
)
data_loader = DatasetLoader(**loader_kwargs)
data_loader.load_data()

# Take the first image of the first test batch and give it a leading
# dimension of size 1.
test_dataloader = data_loader.get_test_dataloader()
batch = next(iter(test_dataloader))
first_image = batch['pixel_values'][0]
img_tensor = first_image.unsqueeze(dim=0).to(device)

# Decoder input: a (1, 1) tensor holding only the BOS token id.
bos_token = tokenizer.get_vocab()[cfg.SpecialTokens.BOS.value]
tokens = torch.tensor([[bos_token]], requires_grad=False).to(device)

# Boolean mask taken from the strict upper triangle of a 1x1 matrix of ones,
# which makes its single entry False.
attn_mask = torch.ones((1, 1), device=device, requires_grad=False).triu(diagonal=1).bool()


Loading COCO dataset...


In [48]:
# Build the CPTR model from the hyper-parameters stored in the experiment config;
# vocab_size and pad_idx come from the tokenizer.
# NOTE(review): the checkpoint at `model_path` is not loaded in this cell, so the
# parameters keep whatever values the constructor assigns. Presumably that is
# sufficient for an architecture trace -- confirm.
model = CPTR(
    num_patches=config["NUM_PATCHES"],
    encoder_arch=config["ENCODER_ARCH"],
    encoding_strategy=config["VIT_ENCODING_STRATEGY"],
    use_embedding_projection=config["USE_PROJECTION_LAYER"],
    img_emb_use_conv=config["USE_CONV_IMG_EMBEDDING"],
    img_emb_dim=config["IMG_EMBEDDING_DIM"],
    patch_size=config["PATCH_SIZE"],
    text_emb_dim=config["TEXT_EMBEDDING_DIM"],
    d_model=config["EMBEDDING_DIM"],
    max_text_seq_len=config["MAX_TEXT_SEQUENCE_LENGTH"],
    vocab_size=vocab_size,
    pad_idx=pad_idx,
    channels=config["NUM_INPUT_CHANNELS"],
    num_encoder_blocks=config["ENCODER_NUM_BLOCKS"],
    num_encoder_heads=config["ENCODER_NUM_HEADS"],
    encoder_hidden_dim=config["ENCODER_HIDDEN_DIM"],
    encoder_dropout_prob=config["ENCODER_DROPOUT_PROB"],
    num_decoder_blocks=config["DECODER_NUM_BLOCKS"],
    num_decoder_heads=config["DECODER_NUM_HEADS"],
    decoder_hidden_dim=config["DECODER_HIDDEN_DIM"],
    decoder_dropout_prob=config["DECODER_DROPOUT_PROB"],
    bias=config["USE_BIAS"],
    use_weight_tying=config["USE_WEIGHT_TYING"],
    sublayer_dropout=config["SUBLAYER_DROPOUT"],
    verbose=False
)

# The sample inputs are moved to `device`; keep the model there as well so that
# changing `device` does not produce a device mismatch during the forward pass.
model = model.to(device)

# Switch to evaluation mode. As the last expression of the cell this also
# displays the module structure.
model.eval()

Initialized CPTR Encoder


CPTR(
  (encoder): CPTREncoder(
    (patcher): ConvPatcher(
      (conv): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16), bias=False)
    )
    (img_pos_embedding): LearnablePositionalEmbedding()
    (encoder_blocks): ModuleList(
      (0-7): 8 x CPTREncoderBlock(
        (MHSA): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=False)
        )
        (layer_norm_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (FFN): Sequential(
          (0): Linear(in_features=768, out_features=3072, bias=False)
          (1): GELU(approximate='none')
          (2): Dropout(p=0.1, inplace=False)
          (3): Linear(in_features=3072, out_features=768, bias=False)
        )
        (layer_norm_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      )
    )
    (images_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (emb_projector): EmbeddingProjection(
    (projection): Linear(in_featu

In [None]:
# Inputs for the traced call: image batch, BOS-only token tensor, attention mask.
trace_inputs = (img_tensor, tokens, attn_mask)

torchvista.trace_model(
    model=model,
    inputs=trace_inputs,
    # Optional export arguments, currently disabled (presumably they write the
    # graph to an HTML file -- confirm against the torchvista docs):
    #   export_format='html',
    #   export_path='architecture_graphs/{}.html'.format(config['ENCODER_ARCH'])
)