# Image Captioning — Testing / Inference Notebook (Saved Weights)

This notebook recreates the **same model architecture** as the training notebook and then:

1. Loads the saved **vocabulary** (so token IDs match).
2. **Builds** the model once on dummy data.
3. Loads your saved **weights** (`.h5`).
4. Generates captions for new images.

In [1]:
import os
import json
import numpy as np
import tensorflow as tf
from PIL import Image
import matplotlib.pyplot as plt
from tensorflow.keras.preprocessing import image
from tensorflow.keras.applications.inception_v3 import preprocess_input

print("TF version:", tf.__version__)

2026-02-02 10:31:40.877113: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2026-02-02 10:31:41.959348: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2026-02-02 10:31:44.963989: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
  if not hasattr(np, "object"):


TF version: 2.20.0


In [2]:
WEIGHTS_PATH = r"model.weights.h5"
VOCAB_PATH   = r"vocab.json"

# Quick sanity checks:
print("Weights exists:", os.path.exists(WEIGHTS_PATH), WEIGHTS_PATH)
print("Vocab exists:", os.path.exists(VOCAB_PATH), VOCAB_PATH)

Weights exists: True model.weights.h5
Vocab exists: True vocab.json


In [3]:
#parameters
MAX_LENGTH = 40
VOCABULARY_SIZE = 15000
EMBEDDING_DIM = 512
UNITS = 512

## Rebuild Encoder and Decoder

In [4]:
def CNN_Encoder():
    inception_v3 = tf.keras.applications.InceptionV3(
        include_top = False,
        weights = 'imagenet'
    )

    output = inception_v3.output
    output = tf.keras.layers.Reshape((-1, output,shape[-1]))(output)

    model = tf.keras.models.Model(inception_v3.input, output)
    return model

In [6]:
class RNN_Decoder(tf.keras.Model):
    def __init__(self, embedding_dim, units, vocab_size):
        super().__init__()
        self.units = units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = tf.keras.layers.GRU(
            units,
            return_sequences = True,
            return_state = True
        )
        self.fc1 = tf.keras.layers.Dense(units)
        self.fc2 = tf.keras.layers.Dense(vocab_size)

    def call(self, x, features, hidden):
        x = self.embedding(x)
        x = tf.concat([tf.expand_dims(features, 1), x], axis=-1)
        output, state = self.gru(x, initial_state=hidden)
        x = self.fc1(output)
        x = self.fc2(x)
        return x, state

In [8]:
#load vocab and rebuild tokenizer+ lookup
def load_vocab(vocab_path: str):
    # If you saved JSON: ["[pad]", "[start]", ...]
    if vocab_path.endswith(".json"):
        with open(vocab_path, "r", encoding="utf-8") as f:
            vocab = json.load(f)
        if not isinstance(vocab, list):
            raise ValueError("vocab.json should contain a JSON list of tokens.")
        return vocab

    # If you saved with pickle (like vocab_coco.file)
    if vocab_path.endswith(".file") or vocab_path.endswith(".pkl") or vocab_path.endswith(".pickle"):
        import pickle
        with open(vocab_path, "rb") as f:
            vocab = pickle.load(f)
        return vocab

    raise ValueError("Unknown vocab format. Use vocab.json or vocab_coco.file (pickle).")

vocab = load_vocab(VOCAB_PATH)
print("Vocab size loaded:", len(vocab))
print("First 20 tokens:", vocab[:20])

# Rebuild tokenizer so it maps words -> ids exactly the same way.
tokenizer = tf.keras.layers.TextVectorization(
    max_tokens=VOCABULARY_SIZE,
    standardize=None,
    output_sequence_length=MAX_LENGTH,
)

# IMPORTANT:
# We DO NOT call tokenizer.adapt(...) in inference.
# Instead we set the vocabulary we loaded from training.
tokenizer.set_vocabulary(vocab)

# id <-> word helpers (same as your training notebook)
word2idx = tf.keras.layers.StringLookup(
    mask_token="",
    vocabulary=tokenizer.get_vocabulary()
)

idx2word = tf.keras.layers.StringLookup(
    mask_token="",
    vocabulary=tokenizer.get_vocabulary(),
    invert=True
)

print("tokenizer.vocabulary_size():", tokenizer.vocabulary_size())


Vocab size loaded: 11839
First 20 tokens: ['', '[UNK]', 'a', '[start]', '[end]', 'on', 'of', 'the', 'in', 'with', 'and', 'is', 'man', 'to', 'sitting', 'an', 'two', 'at', 'people', 'standing']
tokenizer.vocabulary_size(): 11839


I0000 00:00:1769761747.072572   28015 gpu_device.cc:2020] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 3584 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 3060 Laptop GPU, pci bus id: 0000:01:00.0, compute capability: 8.6


In [9]:
#model architecture
def CNN_Encoder():
    inception_v3 = tf.keras.applications.InceptionV3(
        include_top=False,
        weights="imagenet"
    )
    output = inception_v3.output
    output = tf.keras.layers.Reshape((-1, output.shape[-1]))(output)
    cnn_model = tf.keras.models.Model(inception_v3.input, output)
    return cnn_model


class TransformerEncoderLayer(tf.keras.layers.Layer):
    def __init__(self, embed_dim, num_heads):
        super().__init__()
        self.layer_norm_1 = tf.keras.layers.LayerNormalization()
        self.layer_norm_2 = tf.keras.layers.LayerNormalization()
        self.attention = tf.keras.layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim
        )
        self.dense = tf.keras.layers.Dense(embed_dim, activation="relu")

    def call(self, x, training=False):
        x = self.layer_norm_1(x)
        x = self.dense(x)
        attn_output = self.attention(
            query=x, value=x, key=x,
            attention_mask=None,
            training=training
        )
        x = self.layer_norm_2(x + attn_output)
        return x


class Embeddings(tf.keras.layers.Layer):
    def __init__(self, vocab_size, embed_dim, max_len):
        super().__init__()
        self.token_embeddings = tf.keras.layers.Embedding(vocab_size, embed_dim)
        self.position_embeddings = tf.keras.layers.Embedding(max_len, embed_dim, input_shape=(None, max_len))

    def call(self, input_ids):
        length = tf.shape(input_ids)[-1]
        position_ids = tf.range(start=0, limit=length, delta=1)
        position_embeddings = self.position_embeddings(position_ids)
        token_embeddings = self.token_embeddings(input_ids)
        return token_embeddings + position_embeddings


class TransformerDecoderLayer(tf.keras.layers.Layer):
    def __init__(self, embed_dim, units, num_heads):
        super().__init__()
        self.embedding = Embeddings(tokenizer.vocabulary_size(), embed_dim, MAX_LENGTH)

        self.attention_1 = tf.keras.layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim, dropout=0.1
        )
        self.attention_2 = tf.keras.layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim, dropout=0.1
        )

        self.layer_norm_1 = tf.keras.layers.LayerNormalization()
        self.layer_norm_2 = tf.keras.layers.LayerNormalization()
        self.layer_norm_3 = tf.keras.layers.LayerNormalization()

        self.ffn1 = tf.keras.layers.Dense(units, activation="relu")
        self.ffn2 = tf.keras.layers.Dense(embed_dim)

        self.dropout_1 = tf.keras.layers.Dropout(0.3)
        self.dropout_2 = tf.keras.layers.Dropout(0.5)
        self.dropout_3 = tf.keras.layers.Dropout(0.1)

        self.out = tf.keras.layers.Dense(tokenizer.vocabulary_size(), activation="softmax")

    def get_causal_attention_mask(self, inputs):
        input_shape = tf.shape(inputs)
        batch_size, seq_len = input_shape[0], input_shape[1]
        i = tf.range(seq_len)[:, None]
        j = tf.range(seq_len)
        mask = tf.cast(i >= j, dtype="int32")
        mask = tf.reshape(mask, (1, seq_len, seq_len))
        mult = tf.concat(
            [tf.expand_dims(batch_size, -1), tf.constant([1, 1], dtype=tf.int32)],
            axis=0
        )
        return tf.tile(mask, mult)

    def call(self, inputs, encoder_outputs, training=False, mask=None):
        # inputs: (batch, seq_len)
        # encoder_outputs: (batch, image_seq, embed_dim)
        x = self.embedding(inputs)

        causal_mask = self.get_causal_attention_mask(x)
        if mask is not None:
            # mask: (batch, seq_len) -> (batch, 1, seq_len)
            padding_mask = tf.cast(mask[:, tf.newaxis, :], dtype="int32")
            combined_mask = tf.minimum(padding_mask, causal_mask)
        else:
            combined_mask = causal_mask

        attn_1 = self.attention_1(
            query=x, value=x, key=x,
            attention_mask=combined_mask,
            training=training
        )
        x = self.layer_norm_1(x + attn_1)

        attn_2 = self.attention_2(
            query=x, value=encoder_outputs, key=encoder_outputs,
            attention_mask=padding_mask if mask is not None else None,
            training=training
        )
        x = self.layer_norm_2(x + attn_2)

        ffn_out = self.ffn1(x)
        ffn_out = self.dropout_1(ffn_out, training=training)
        ffn_out = self.ffn2(ffn_out)
        x = self.layer_norm_3(x + ffn_out)

        x = self.dropout_3(x, training=training)
        return self.out(x)


class ImageCaptioningModel(tf.keras.Model):
    def __init__(self, cnn_model, encoder, decoder, image_aug=None):
        super().__init__()
        self.cnn_model = cnn_model
        self.encoder = encoder
        self.decoder = decoder
        self.image_aug = image_aug
        self.loss_tracker = tf.keras.metrics.Mean(name="loss")
        self.acc_tracker = tf.keras.metrics.Mean(name="accuracy")

    def calculate_loss(self, y_true, y_pred, mask):
        loss = self.loss(y_true, y_pred)
        mask = tf.cast(mask, dtype=loss.dtype)
        loss *= mask
        return tf.reduce_sum(loss) / tf.reduce_sum(mask)

    def calculate_accuracy(self, y_true, y_pred, mask):
        accuracy = tf.equal(y_true, tf.argmax(y_pred, axis=2))
        accuracy = tf.math.logical_and(mask, accuracy)
        accuracy = tf.cast(accuracy, dtype=tf.float32)
        mask = tf.cast(mask, dtype=tf.float32)
        return tf.reduce_sum(accuracy) / tf.reduce_sum(mask)

    def compute_loss_and_acc(self, img_embed, captions, training=True):
        encoder_output = self.encoder(img_embed, training=training)
        y_input = captions[:, :-1]
        y_true = captions[:, 1:]
        mask = (y_true != 0)
        y_pred = self.decoder(y_input, encoder_output, training=training, mask=mask)
        loss = self.calculate_loss(y_true, y_pred, mask)
        acc = self.calculate_accuracy(y_true, y_pred, mask)
        return loss, acc

    def train_step(self, batch):
        imgs, captions = batch
        if self.image_aug:
            imgs = self.image_aug(imgs)

        img_embed = self.cnn_model(imgs)

        with tf.GradientTape() as tape:
            loss, acc = self.compute_loss_and_acc(img_embed, captions, training=True)

        train_vars = self.encoder.trainable_variables + self.decoder.trainable_variables
        grads = tape.gradient(loss, train_vars)
        self.optimizer.apply_gradients(zip(grads, train_vars))

        self.loss_tracker.update_state(loss)
        self.acc_tracker.update_state(acc)
        return {"loss": self.loss_tracker.result(), "acc": self.acc_tracker.result()}

    def test_step(self, batch):
        imgs, captions = batch
        img_embed = self.cnn_model(imgs)
        loss, acc = self.compute_loss_and_acc(img_embed, captions, training=False)
        self.loss_tracker.update_state(loss)
        self.acc_tracker.update_state(acc)
        return {"loss": self.loss_tracker.result(), "acc": self.acc_tracker.result()}

    @property
    def metrics(self):
        return [self.loss_tracker, self.acc_tracker]


In [10]:
encoder = TransformerEncoderLayer(EMBEDDING_DIM, num_heads=1)
decoder = TransformerDecoderLayer(EMBEDDING_DIM, UNITS, num_heads=8)
cnn_model = CNN_Encoder()

caption_model = ImageCaptioningModel(cnn_model=cnn_model, encoder=encoder, decoder=decoder, image_aug=None)

# compile so `self.loss` exists (required by compute_loss_and_acc)
cross_entropy = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False, reduction="none")
caption_model.compile(optimizer=tf.keras.optimizers.Adam(), loss=cross_entropy)

# --- BUILD STEP (very important for subclassed models) ---
# Create dummy inputs to force all variables to be created before load_weights()
dummy_imgs = tf.zeros((1, 299, 299, 3), dtype=tf.float32)
dummy_captions = tf.zeros((1, MAX_LENGTH), dtype=tf.int64)

# Run 1 forward pass through the training logic (no gradients needed here)
img_embed = caption_model.cnn_model(dummy_imgs)
_ = caption_model.compute_loss_and_acc(img_embed, dummy_captions, training=False)

print("Built encoder vars:", len(caption_model.encoder.variables))
print("Built decoder vars:", len(caption_model.decoder.variables))

# Now load weights safely
caption_model.load_weights(WEIGHTS_PATH)
print("✅ Weights loaded!")


  super().__init__(**kwargs)


Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/inception_v3/inception_v3_weights_tf_dim_ordering_tf_kernels_notop.h5
[1m87910968/87910968[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 0us/step


2026-01-30 09:30:07.017566: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:473] Loaded cuDNN version 91002
2026-01-30 09:30:12.727630: W tensorflow/core/framework/op_kernel.cc:1842] INVALID_ARGUMENT: required broadcastable shapes
2026-01-30 09:30:12.727689: I tensorflow/core/framework/local_rendezvous.cc:407] Local rendezvous is aborting with status: INVALID_ARGUMENT: required broadcastable shapes


InvalidArgumentError: Exception encountered when calling Softmax.call().

[1m{{function_node __wrapped__SelectV2_device_/job:localhost/replica:0/task:0/device:GPU:0}} required broadcastable shapes [Op:SelectV2] name: [0m

Arguments received by Softmax.call():
  • inputs=tf.Tensor(shape=(1, 8, 39, 64), dtype=float32)
  • mask=tf.Tensor(shape=(1, 1, 1, 39), dtype=bool)