In [None]:
# Mount Google Drive under /content/drive so files stored there (the Kaggle
# API token copied in the next cell) are readable from this runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!mkdir ~/.kaggle
!cp /content/drive/MyDrive/kaggle_API/kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

In [None]:
import kaggle

In [None]:
# Download the Flickr8k archive (flickr8k.zip) into the working directory
# using the Kaggle CLI.
!kaggle datasets download -d adityajn105/flickr8k

Dataset URL: https://www.kaggle.com/datasets/adityajn105/flickr8k
License(s): CC0-1.0
Downloading flickr8k.zip to /content
 98% 1.01G/1.04G [00:12<00:00, 194MB/s]
100% 1.04G/1.04G [00:12<00:00, 89.5MB/s]


In [None]:
%%capture
!unzip flickr8k.zip -d dataset

In [None]:
import tensorflow as tf
import matplotlib.pyplot as plt
import numpy as np
import re
import string
import os
import einops

In [None]:
def read_data(txt_file):
  """Read an ``image,caption`` text file into two parallel arrays.

  The first line is treated as a header and skipped. Every remaining
  non-blank line is split at its first comma only, so commas inside the
  caption are preserved.

  Args:
    txt_file: Path to the captions file.

  Returns:
    A ``(data, target)`` tuple of NumPy arrays of equal length: the text
    before the first comma and the text after it, each stripped of
    surrounding whitespace.

  Raises:
    ValueError: If a non-blank line contains no comma.
  """
  # Explicit encoding: the platform default is not guaranteed to be UTF-8.
  with open(txt_file, encoding = 'utf-8') as f:
    lines = f.read().splitlines()[1:]

  data = []
  target = []

  for line in lines:
    # Blank lines carry no record; skip them instead of failing to unpack
    # the result of split().
    if not line.strip():
      continue
    d, t = line.split(',', 1)
    data.append(d.strip())
    target.append(t.strip())

  return np.array(data), np.array(target)

In [None]:
# data: first column of captions.txt (used later as image file names);
# target: second column (the caption strings).
data, target = read_data('dataset/captions.txt')

In [None]:
# data = data.reshape(1, -1)
# target = target.reshape(1, -1)

In [None]:
def custom_standard(text):
  """Standardize caption text for the TextVectorization layer.

  Lower-cases the input, deletes every character listed in
  ``string.punctuation``, trims surrounding whitespace, and wraps the result
  in ``[START]`` / ``[END]`` markers separated by single spaces.

  Args:
    text: String tensor of captions.

  Returns:
    String tensor of standardized captions.
  """
  punctuation_class = f'[{re.escape(string.punctuation)}]'
  cleaned = tf.strings.strip(
      tf.strings.regex_replace(tf.strings.lower(text), punctuation_class, '')
  )
  # The markers are joined on after punctuation removal, so their brackets
  # survive.
  return tf.strings.join(['[START]', cleaned, '[END]'], separator = ' ')

In [None]:
# Caption tokenizer: vocabulary capped at 8000 tokens, ragged (unpadded)
# output, and custom_standard used as the text standardization step.
text_vector = tf.keras.layers.TextVectorization(
    max_tokens = 8000,
    ragged = True,
    standardize = custom_standard
)

In [None]:
# Build the tokenizer vocabulary from the caption strings.
text_vector.adapt(target)

In [None]:
# Lookup layers built on the tokenizer's vocabulary, with "" declared as the
# mask token: word_to_idx maps token string -> id, idx_to_word (invert=True)
# maps id -> token string.
word_to_idx = tf.keras.layers.StringLookup(
    vocabulary = text_vector.get_vocabulary(),
    mask_token = ""
)
idx_to_word = tf.keras.layers.StringLookup(
    vocabulary = text_vector.get_vocabulary(),
    mask_token = "",
    invert = True
)

In [None]:
# ImageNet-pretrained MobileNetV3Small without its classification head, used
# as the image feature extractor for 224x224 RGB inputs.
# NOTE(review): include_preprocessing=True bundles input rescaling into the
# model; per the Keras documentation it then expects raw [0, 255] pixel
# values — confirm that callers do not rescale images themselves.
image_encoder = tf.keras.applications.MobileNetV3Small(
    include_top = False,
    weights = 'imagenet',
    input_shape = (224, 224, 3),
    include_preprocessing = True
)

# Disabled alternative: pool the feature map down to one vector per image.
# image_encoder = tf.keras.Sequential([
#     image_encoder,
#     tf.keras.layers.GlobalAveragePooling2D()
# ])

# Freeze the encoder so its weights are not updated during training.
image_encoder.trainable = False

In [None]:
# summary() prints the layer table itself and returns None, so wrapping it in
# print() only appends a stray "None" to the cell output.
image_encoder.summary()

None


In [None]:
def load_image(image_path):
    """Load a JPEG file as a 224x224 three-channel image tensor.

    Args:
        image_path: Path of the JPEG file to read.

    Returns:
        The decoded image resized to 224x224. Pixel values are not rescaled
        by this function.
    """
    raw_bytes = tf.io.read_file(image_path)
    decoded = tf.io.decode_jpeg(raw_bytes, channels=3)
    return tf.image.resize(decoded, (224, 224))

In [None]:
# Smoke test: encode a single image (a leading batch axis is added first) and
# inspect the shape of the resulting feature map.
img = load_image(r'/content/dataset/Images/1001773457_577c3a7d70.jpg')[tf.newaxis, :]
res = image_encoder(img)
print(res.shape)

(1, 7, 7, 576)


In [None]:
# Experiment: Flatten collapses every non-batch axis of the feature map into
# one long vector per image.
falt = tf.keras.layers.Flatten()
res3 = falt(res)
print(res3.shape)

(1, 28224)


In [None]:
# Experiment: merge the spatial axes (h, w) into a single sequence axis while
# keeping the channel axis — the layout the decoder's cross-attention consumes.
image = einops.rearrange(res, 'b h w c -> b (h w) c')
print(image.shape)

(1, 49, 576)


In [None]:
def prepare_data(image, text):
  """Turn one (image file name, caption) pair into model inputs and labels.

  Args:
    image: String tensor holding the image file name, relative to
      ``/content/dataset/Images``.
    text: String tensor holding the caption; expected to carry a leading
      axis of size 1, which is squeezed away after tokenization.

  Returns:
    ``((image, text_in), text_out)`` where ``image`` is the 224x224 RGB image
    as float32 with pixel values left in [0, 255], ``text_in`` is the token
    id sequence without its last token and ``text_out`` the sequence without
    its first token, both converted to dense length-50 tensors filled with 0.
  """
  image_path = tf.strings.join(['/content/dataset/Images', image], separator = os.path.sep)
  image = tf.io.read_file(image_path)
  image = tf.image.decode_jpeg(image, channels = 3)
  image = tf.image.resize(image, (224, 224))
  # Do NOT divide by 255 here: the encoder is created with
  # include_preprocessing=True, so it rescales its input internally and
  # expects raw [0, 255] pixels. Rescaling twice would feed it near-black
  # images.
  image = tf.cast(image, tf.float32)

  text = text_vector(text)
  # Teacher forcing: the decoder input drops the final token, the label drops
  # the leading one, so position i is trained to predict token i + 1.
  text_in = text[:, :-1]
  text_out = text[:, 1:]
  # 0 is the padding id that the loss/metric masks out.
  text_in = text_in.to_tensor(default_value=0, shape=[None, 50])
  text_out = text_out.to_tensor(default_value=0, shape=[None, 50])
  text_in = tf.squeeze(text_in, axis=0)
  text_out = tf.squeeze(text_out, axis=0)

  return (image, text_in), text_out


In [None]:
# Give every caption its own length-1 axis: shape (num_captions, 1).
target = target.reshape(-1, 1)

In [None]:
# Preview the reshaped caption array (NumPy elides the middle rows).
print(target)

[['A child in a pink dress is climbing up a set of stairs in an entry way .']
 ['A girl going into a wooden building .']
 ['A little girl climbing into a wooden playhouse .']
 ...
 ['A person in a red shirt climbing up a rock face covered in assist handles .']
 ['A rock climber in a red shirt .']
 ['A rock climber practices on a rock climbing wall .']]


In [None]:
# Number of image-name entries (read_data appends one per caption line).
print(data.shape)

(40455,)


In [None]:
# Index of the first validation row: the first 80% of the caption rows are
# used for training, the remaining 20% for validation.
train_list = int(len(target) * 0.8)

train_data = tf.data.Dataset.from_tensor_slices((data[: train_list], target[:train_list]))
val_data = tf.data.Dataset.from_tensor_slices((data[train_list: ], target[train_list: ]))

train_data = (
    train_data
    # Shuffle the (file name, caption) string pairs before decoding so batches
    # are not served in file order; shuffling ahead of map() keeps the buffer
    # cheap because it holds strings, not images. max(..., 1) guards the
    # degenerate empty split, since the buffer size must be positive.
    .shuffle(max(train_list, 1))
    .map(prepare_data, num_parallel_calls = tf.data.AUTOTUNE)
    .batch(32)
    .prefetch(tf.data.AUTOTUNE)
)

# Validation data keeps its original order; no shuffling is needed there.
val_data = (
    val_data.map(prepare_data, num_parallel_calls = tf.data.AUTOTUNE).batch(32).prefetch(tf.data.AUTOTUNE)
)

In [None]:
# Sanity-check one training batch: print the tensor shapes and decode the
# first label / input sequence back to text to confirm the one-token shift.
for inp, out in train_data.take(1):
  img, text = inp
  print(img.shape)
  print(text.shape)
  print(out.shape)
  print(tf.strings.reduce_join(idx_to_word(out[0]), separator = " ").numpy())
  print(tf.strings.reduce_join(idx_to_word(text[0]), separator = " ").numpy())

(32, 224, 224, 3)
(32, 50)
(32, 50)
b'a child in a pink dress is climbing up a set of stairs in an entry way [END]                                '
b'[START] a child in a pink dress is climbing up a set of stairs in an entry way                                '


In [None]:
class Embedding(tf.keras.layers.Layer):
  """Token embedding plus a learned positional embedding.

  Args:
    vocab_size: Number of distinct token ids.
    embed_dim: Width of both the token and the positional embedding.
    max_len: Number of rows in the positional table; input sequences must
      not be longer than this.
  """
  def __init__(self, vocab_size, embed_dim, max_len):
    super().__init__()
    # mask_zero=True marks token id 0 as padding.
    self.embed = tf.keras.layers.Embedding(vocab_size, output_dim = embed_dim, mask_zero = True)
    self.pos_embed = tf.keras.layers.Embedding(input_dim = max_len, output_dim = embed_dim)
    # Let Keras attach the padding mask from compute_mask() to the output.
    self.supports_masking = True

  def compute_mask(self, inputs, mask = None):
    """Return the padding mask of the token embedding (True where id != 0)."""
    return self.embed.compute_mask(inputs, mask)

  def call(self, x):
    """Embed token ids and add position information.

    Args:
      x: Integer tensor of token ids with shape (batch, seq_len).

    Returns:
      Tensor of shape (batch, seq_len, embed_dim).
    """
    # Take the sequence length from the token ids on axis 1. Reading the last
    # axis after embedding would yield embed_dim instead, producing one
    # position per feature and the "Incompatible shapes" failure in fit().
    seq_len = tf.shape(x)[1]
    x = self.embed(x)

    pos = tf.range(seq_len)
    pos = pos[tf.newaxis, :]      # (1, seq_len)
    pos = self.pos_embed(pos)     # (1, seq_len, embed_dim)

    # Plain addition broadcasts the positions over the batch axis.
    return x + pos

In [None]:
class causalSelfAtten(tf.keras.layers.Layer):
  """Causal self-attention followed by a residual add and layer norm.

  All keyword arguments are forwarded unchanged to
  ``tf.keras.layers.MultiHeadAttention``.
  """
  def __init__(self, **kwargs):
    super().__init__()
    self.mha = tf.keras.layers.MultiHeadAttention(**kwargs)
    self.layernorm = tf.keras.layers.LayerNormalization()
    self.add = tf.keras.layers.Add()

  def call(self, x):
    """Attend from ``x`` to itself under a causal mask and normalize the residual sum."""
    attended = self.mha(query = x, value = x, use_causal_mask = True)
    return self.layernorm(self.add([x, attended]))

In [None]:
class crossAttention(tf.keras.layers.Layer):
    """Cross-attention followed by a residual add and layer norm.

    All constructor keyword arguments are forwarded unchanged to
    ``tf.keras.layers.MultiHeadAttention``.
    """
    def __init__(self, **kwargs):
        super().__init__()
        self.mha = tf.keras.layers.MultiHeadAttention(**kwargs)
        self.layernorm = tf.keras.layers.LayerNormalization()
        self.add = tf.keras.layers.Add()

    def call(self, x, y, **kwargs):
        """Attend from ``x`` (query) to ``y`` (value) and normalize the residual sum.

        Extra keyword arguments are accepted but not used.
        """
        # The attention scores are requested from the layer but not used.
        attended, _scores = self.mha(query=x, value=y, return_attention_scores=True)
        return self.layernorm(self.add([x, attended]))

In [None]:
class feedForward(tf.keras.layers.Layer):
  """Feed-forward sub-layer with a residual connection and layer norm.

  Args:
    dff: Output width of the block; the hidden ReLU layer is twice as wide.
  """
  def __init__(self, dff):
    super().__init__()
    expand = tf.keras.layers.Dense(2 * dff, activation = 'relu')
    project = tf.keras.layers.Dense(dff)
    drop = tf.keras.layers.Dropout(0.1)
    self.model = tf.keras.Sequential([expand, project, drop])
    self.layerNorm = tf.keras.layers.LayerNormalization()

  def call(self, x):
    """Add the feed-forward output to ``x`` and layer-normalize the sum."""
    residual_sum = x + self.model(x)
    return self.layerNorm(residual_sum)

In [None]:
class Decoder(tf.keras.layers.Layer):
  """One decoder layer: causal self-attention, cross-attention, feed-forward.

  Args:
    num_heads: Number of attention heads for both attention sub-layers.
    dff: Passed as ``key_dim`` to both attention sub-layers and as the width
      of the feed-forward sub-layer.
  """
  def __init__(self, num_heads, dff):
    super().__init__()

    self.self_attn = causalSelfAtten(num_heads = num_heads, key_dim = dff)
    self.ffn = feedForward(dff)
    self.cross_attn = crossAttention(num_heads=num_heads, key_dim = dff)

  def call(self, inp):
    """Run the three sub-layers on an ``(image_features, text)`` pair.

    Returns:
      The updated text representation.
    """
    image_features, tokens = inp

    tokens = self.self_attn(tokens)
    # The text attends to the image features here.
    tokens = self.cross_attn(tokens, image_features)
    return self.ffn(tokens)

In [None]:
class C_Model(tf.keras.Model):
  """Image-captioning model: image encoder + stack of decoder layers.

  Args:
    text_vector: Tokenizer layer; its ``vocabulary_size()`` sizes the token
      embedding and the output layer.
    image_encoder: Callable model mapping an image batch to a
      ``(b, h, w, c)`` feature map.
    num_layers: Number of ``Decoder`` layers.
    dff: Embedding width, also passed to every ``Decoder``.
    max_len: Number of positions in the positional embedding.
    num_heads: Number of attention heads per ``Decoder``.
  """
  @classmethod
  def add_method(cls, fun):
    """Decorator that attaches ``fun`` to the class under its own name."""
    setattr(cls, fun.__name__, fun)
    return fun

  def __init__(self, text_vector, image_encoder, num_layers = 1, dff = 256, max_len = 50, num_heads = 1):
    super().__init__()
    self.text_vector = text_vector
    self.image_encoder = image_encoder
    self.emb = Embedding(text_vector.vocabulary_size(), dff, max_len)
    self.decoder = [Decoder(num_heads, dff) for _ in range(num_layers)]
    # One output unit per vocabulary entry; no activation is applied here.
    self.out_layer = tf.keras.layers.Dense(text_vector.vocabulary_size())

  def call(self, inp):
    """Map an ``(images, token_ids)`` pair to per-position vocabulary scores."""
    images, tokens = inp

    # Flatten the spatial grid so the decoder can attend over image positions.
    features = einops.rearrange(self.image_encoder(images), 'b h w c -> b (h w) c')

    hidden = self.emb(tokens)
    for layer in self.decoder:
      hidden = layer((features, hidden))

    return self.out_layer(hidden)

In [None]:
# Instantiate the captioner with two decoder layers and two attention heads;
# dff and max_len keep their defaults (256 and 50).
caption_model = C_Model(text_vector, image_encoder, num_layers = 2, num_heads = 2)

In [None]:
def masked_loss(y_true, y_pred):
  """Sparse categorical cross-entropy averaged over non-padding positions.

  Args:
    y_true: Integer label ids; positions equal to 0 are excluded.
    y_pred: Logits over the vocabulary for each position.

  Returns:
    Sum of the per-position losses at positions where ``y_true != 0``,
    divided by the number of such positions.
  """
  per_token = tf.keras.losses.SparseCategoricalCrossentropy(
      from_logits = True, reduction = 'none')(y_true, y_pred)

  keep = tf.cast(y_true != 0, per_token.dtype)

  return tf.reduce_sum(per_token * keep) / tf.reduce_sum(keep)

def masked_acc(y_true, y_pred):
  """Token-level accuracy over non-padding positions.

  Args:
    y_true: Integer label ids; positions equal to 0 are excluded.
    y_pred: Scores over the vocabulary for each position.

  Returns:
    Number of non-padding positions whose arg-max prediction equals the
    label, divided by the number of non-padding positions.
  """
  y_pred = tf.argmax(y_pred, axis=-1)
  y_pred = tf.cast(y_pred, y_true.dtype)

  mask = tf.cast(y_true != 0, tf.float32)
  # Count matches at real-token positions only. Without the mask, every padded
  # position predicted as 0 would add to the numerator while being excluded
  # from the denominator, inflating the metric (possibly above 1).
  match = tf.cast(y_true == y_pred, tf.float32) * mask

  return tf.reduce_sum(match)/tf.reduce_sum(mask)

In [None]:
# Train with Adam (learning rate 1e-4) using the padding-aware loss and metric.
caption_model.compile(optimizer = tf.keras.optimizers.Adam(1e-4), loss = masked_loss, metrics = [masked_acc])

In [None]:
# Train for 5 epochs, evaluating on val_data after each one; the returned
# History object is kept in `his`.
his = caption_model.fit(train_data, epochs = 5, validation_data = val_data)

Epoch 1/5


InvalidArgumentError: Graph execution error:

Detected at node gradient_tape/c__model_8_1/embedding_24_1/add_40_1/Add/BroadcastGradientArgs defined at (most recent call last):
  File "<frozen runpy>", line 198, in _run_module_as_main

  File "<frozen runpy>", line 88, in _run_code

  File "/usr/local/lib/python3.11/dist-packages/colab_kernel_launcher.py", line 37, in <module>

  File "/usr/local/lib/python3.11/dist-packages/traitlets/config/application.py", line 992, in launch_instance

  File "/usr/local/lib/python3.11/dist-packages/ipykernel/kernelapp.py", line 619, in start

  File "/usr/local/lib/python3.11/dist-packages/tornado/platform/asyncio.py", line 205, in start

  File "/usr/lib/python3.11/asyncio/base_events.py", line 608, in run_forever

  File "/usr/lib/python3.11/asyncio/base_events.py", line 1936, in _run_once

  File "/usr/lib/python3.11/asyncio/events.py", line 84, in _run

  File "/usr/local/lib/python3.11/dist-packages/tornado/ioloop.py", line 699, in <lambda>

  File "/usr/local/lib/python3.11/dist-packages/tornado/ioloop.py", line 750, in _run_callback

  File "/usr/local/lib/python3.11/dist-packages/tornado/gen.py", line 824, in inner

  File "/usr/local/lib/python3.11/dist-packages/tornado/gen.py", line 785, in run

  File "/usr/local/lib/python3.11/dist-packages/ipykernel/kernelbase.py", line 361, in process_one

  File "/usr/local/lib/python3.11/dist-packages/tornado/gen.py", line 233, in wrapper

  File "/usr/local/lib/python3.11/dist-packages/ipykernel/kernelbase.py", line 261, in dispatch_shell

  File "/usr/local/lib/python3.11/dist-packages/tornado/gen.py", line 233, in wrapper

  File "/usr/local/lib/python3.11/dist-packages/ipykernel/kernelbase.py", line 539, in execute_request

  File "/usr/local/lib/python3.11/dist-packages/tornado/gen.py", line 233, in wrapper

  File "/usr/local/lib/python3.11/dist-packages/ipykernel/ipkernel.py", line 302, in do_execute

  File "/usr/local/lib/python3.11/dist-packages/ipykernel/zmqshell.py", line 539, in run_cell

  File "/usr/local/lib/python3.11/dist-packages/IPython/core/interactiveshell.py", line 2975, in run_cell

  File "/usr/local/lib/python3.11/dist-packages/IPython/core/interactiveshell.py", line 3030, in _run_cell

  File "/usr/local/lib/python3.11/dist-packages/IPython/core/async_helpers.py", line 78, in _pseudo_sync_runner

  File "/usr/local/lib/python3.11/dist-packages/IPython/core/interactiveshell.py", line 3257, in run_cell_async

  File "/usr/local/lib/python3.11/dist-packages/IPython/core/interactiveshell.py", line 3473, in run_ast_nodes

  File "/usr/local/lib/python3.11/dist-packages/IPython/core/interactiveshell.py", line 3553, in run_code

  File "<ipython-input-101-ac8ba20fbf53>", line 1, in <cell line: 0>

  File "/usr/local/lib/python3.11/dist-packages/keras/src/utils/traceback_utils.py", line 117, in error_handler

  File "/usr/local/lib/python3.11/dist-packages/keras/src/backend/tensorflow/trainer.py", line 371, in fit

  File "/usr/local/lib/python3.11/dist-packages/keras/src/backend/tensorflow/trainer.py", line 219, in function

  File "/usr/local/lib/python3.11/dist-packages/keras/src/backend/tensorflow/trainer.py", line 132, in multi_step_on_iterator

  File "/usr/local/lib/python3.11/dist-packages/keras/src/backend/tensorflow/trainer.py", line 113, in one_step_on_data

  File "/usr/local/lib/python3.11/dist-packages/keras/src/backend/tensorflow/trainer.py", line 77, in train_step

Incompatible shapes: [32,50,256] vs. [256,1,256]
	 [[{{node gradient_tape/c__model_8_1/embedding_24_1/add_40_1/Add/BroadcastGradientArgs}}]] [Op:__inference_multi_step_on_iterator_70813]