In [1]:
import tensorflow as tf
import numpy as np
import tensorflow.keras as keras
import pandas as pd
import os
from keras import layers

2024-04-11 12:16:45.197056: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-04-11 12:16:45.220605: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX_VNNI, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Hyper-parameters and paths live in train_config.py and are pulled in with a
# star import. Names this notebook relies on from it: DATA_PATH, IMAGES_PATH,
# IMAGE_SIZE, VAL_FRACTION, SEQ_LENGTH, BATCH_SIZE, AUTOTUNE, EMBED_DIM,
# FF_DIM, VOCAB_SIZE.
# NOTE(review): the block below is an old inline copy kept for reference and
# may be stale -- e.g. the tokenizer output further down has length 32, not
# 100. Confirm against train_config.py.
# ###CONSTANTS
# IMAGE_SIZE=(224, 224)
# VAL_FRACTION=0.05
# SEQ_LENGTH=100
# BATCH_SIZE=64
# EPOCHS=20
# AUTOTUNE=tf.data.AUTOTUNE
# ###
from train_config import *

In [3]:
# Load the caption table: one row per (image, caption) pair, '|'-separated.
# Rows with missing fields are dropped.
captionings_df = pd.read_csv(os.path.join(DATA_PATH, "results.csv"), sep="|").dropna()
captionings_df.columns = ["image_name", "comment_number", "comment"]

# Prefix every file name with the image directory using exactly one separator.
# IMAGES_PATH may already end in "/", in which case plain concatenation
# produced ".../flickr30k_images//<file>".
captionings_df["image_name"] = IMAGES_PATH.rstrip("/") + "/" + captionings_df["image_name"]

# Wrap each caption in the special start/end tokens.
captionings_df["comment"] = "<START> " + captionings_df["comment"] + " <END>"
captionings_df.head(10)

Unnamed: 0,image_name,comment_number,comment
0,data/flickr30k_images/flickr30k_images//100009...,0,<START> Two young guys with shaggy hair look ...
1,data/flickr30k_images/flickr30k_images//100009...,1,"<START> Two young , White males are outside n..."
2,data/flickr30k_images/flickr30k_images//100009...,2,<START> Two men in green shirts are standing ...
3,data/flickr30k_images/flickr30k_images//100009...,3,<START> A man in a blue shirt standing in a g...
4,data/flickr30k_images/flickr30k_images//100009...,4,<START> Two friends enjoy time spent together...
5,data/flickr30k_images/flickr30k_images//100024...,0,<START> Several men in hard hats are operatin...
6,data/flickr30k_images/flickr30k_images//100024...,1,<START> Workers look down from up above on a ...
7,data/flickr30k_images/flickr30k_images//100024...,2,<START> Two men working on a machine wearing ...
8,data/flickr30k_images/flickr30k_images//100024...,3,<START> Four men on top of a tall structure ....
9,data/flickr30k_images/flickr30k_images//100024...,4,<START> Three men on a large rig . <END>


In [4]:
# Distribution of caption lengths in space-separated tokens (special tokens included).
captionings_df["comment"].str.split(" ").str.len().describe()

count    158914.000000
mean         16.389771
std           5.421389
min           5.000000
25%          13.000000
50%          15.000000
75%          19.000000
max          85.000000
Name: comment, dtype: float64

In [5]:
# Confirm the row count and column dtypes after dropna().
captionings_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 158914 entries, 0 to 158914
Data columns (total 3 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   image_name      158914 non-null  object
 1   comment_number  158914 non-null  object
 2   comment         158914 non-null  object
dtypes: object(3)
memory usage: 4.8+ MB


In [6]:
# Shuffle all rows once with a fixed seed so the split below is reproducible.
# NOTE(review): rows are shuffled per caption, not per image, so captions of
# the same image can land in both splits -- confirm this is intended.
captionings_df = captionings_df.sample(frac=1, random_state=42, replace=False)

n_train_examples = int(len(captionings_df) * (1 - VAL_FRACTION))

# Positional split: the first n_train_examples rows train, the rest validate.
train_captionings_df = captionings_df.iloc[:n_train_examples]
val_captionings_df = captionings_df.iloc[n_train_examples:]

print("Train image-text examples: ", train_captionings_df.shape[0])
print("Validation image-text examples: ", val_captionings_df.shape[0])

# Save the splits. Create the output directory first so to_csv does not fail
# on a fresh checkout where "splits/" does not exist yet.
os.makedirs("splits", exist_ok=True)
train_captionings_df.to_csv("splits/train_captions.csv", index=False)
val_captionings_df.to_csv("splits/val_captions.csv", index=False)

Train image-text examples:  150968
Validation image-text examples:  7946


In [7]:
# Quick look at the columns carried into the training split.
train_captionings_df.columns

Index(['image_name', 'comment_number', 'comment'], dtype='object')

## Preprocessing

In [8]:
# Project-local preprocessing helpers (data_processing.py).
# build_image_augmenter is imported but not used in the cells shown here.
from data_processing import build_tokenizer, build_image_augmenter,  decode_and_resize


# Fit the tokenizer vocabulary on the training captions only; the validation
# captions are not passed to adapt().
tokenizer = build_tokenizer()
tokenizer.adapt(train_captionings_df["comment"].tolist())

2024-04-11 12:16:50.035710: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:995] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-04-11 12:16:50.054247: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:995] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-04-11 12:16:50.054324: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:995] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysf

In [9]:
# Peek at the first 100 vocabulary entries of the adapted tokenizer.
print(tokenizer.get_vocabulary()[:100])

['', '[UNK]', 'a', 'end', 'start', 'in', 'the', 'on', 'and', 'man', 'is', 'of', 'with', 'woman', 'two', 'are', 'to', 'people', 'at', 'an', 'wearing', 'white', 'young', 'shirt', 'black', 'while', 'his', 'blue', 'red', 'sitting', 'girl', 'men', 'boy', 'standing', 'dog', 'playing', 'street', 'group', 'down', 'front', 'her', 'walking', 'holding', 'one', 'water', 'by', 'three', 'women', 'green', 'up', 'looking', 'child', 'as', 'for', 'little', 'large', 'outside', 'yellow', 'person', 'children', 'brown', 'through', 'hat', 'their', 'from', 'other', 'ball', 'small', 'next', 'into', 'over', 'some', 'dressed', 'out', 'another', 'running', 'building', 'jacket', 'riding', 'around', 'orange', 'near', 'field', 'crowd', 'stands', 'beach', 'background', 'pink', 'sidewalk', 'behind', 'jumping', 'girls', 'table', 'sits', 'grass', 'bike', 'snow', 'that', 'looks', 'top']


In [10]:
# List the GPUs visible to TensorFlow.
physical_devices = tf.config.list_physical_devices('GPU')
physical_devices


[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [11]:
# Smoke-test the tokenizer on a free-form sentence and inspect the token ids.
tokenizer("a tall man with blue t-shirt and a dog")

<tf.Tensor: shape=(32,), dtype=int64, numpy=
array([  2, 411,   9,  12,  27, 181,   8,   2,  34,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0])>

In [143]:
def process_input(img_path, captions):
    """Turn one (image path, caption) pair into model-ready tensors.

    The image is loaded through ``decode_and_resize``. The caption is
    tokenized and reshaped to ``(1, SEQ_LENGTH)``; the extra leading axis is
    the per-image caption slot that ``ImageCaptioningModel`` indexes with
    ``batch_seq[:, i, ...]``.

    Returns:
        Tuple ``(image, token_ids)``.
    """
    image = decode_and_resize(img_path)
    token_ids = tf.reshape(tokenizer(captions), shape=(1, SEQ_LENGTH))
    return image, token_ids

def make_dataset(images, captions):
    """Build a shuffled, batched and prefetched ``tf.data`` pipeline.

    Args:
        images: sequence of image paths.
        captions: sequence of caption strings, aligned with ``images``.

    Returns:
        A ``tf.data.Dataset`` yielding the ``process_input`` outputs in
        batches of ``BATCH_SIZE``.
    """
    return (
        tf.data.Dataset.from_tensor_slices((images, captions))
        .shuffle(BATCH_SIZE * 8)  # small shuffle buffer of 8 batches
        .map(process_input, num_parallel_calls=AUTOTUNE)
        .batch(BATCH_SIZE)
        .prefetch(AUTOTUNE)
    )

In [144]:
train_dataset = make_dataset(
    train_captionings_df["image_name"].tolist(),
    train_captionings_df["comment"].tolist(),
)

# Validation must read the held-out split. It was previously built from
# train_captionings_df, so validation metrics were computed on training data.
val_dataset = make_dataset(
    val_captionings_df["image_name"].tolist(),
    val_captionings_df["comment"].tolist(),
)



In [145]:
# Sanity-check the shapes of the first five batches: (image batch, caption batch).
for img, cap in val_dataset.take(5):
    print(img.shape, cap.shape)

(64, 224, 224, 3) (64, 1, 32)
(64, 224, 224, 3) (64, 1, 32)
(64, 224, 224, 3) (64, 1, 32)
(64, 224, 224, 3) (64, 1, 32)
(64, 224, 224, 3) (64, 1, 32)


Reference: Keras image captioning example — https://keras.io/examples/vision/image_captioning/

In [146]:
def get_cnn_model(base_model):
    """Wrap a convolutional backbone as a frozen feature extractor.

    The backbone is frozen in place (``trainable = False``) and its output
    feature map is reshaped to ``(positions, channels)``, e.g. a
    ``(7, 7, 1280)`` volume becomes ``(49, 1280)``. The transformer therefore
    receives the unrolled conv volume, not a single embedding vector.

    Args:
        base_model: Keras model exposing ``.input`` and ``.output``.

    Returns:
        A ``keras.models.Model`` mapping images to the flattened feature map.
    """
    base_model.trainable = False  # the feature extractor is not fine-tuned

    feature_map = base_model.output
    n_channels = feature_map.shape[-1]
    flattened = keras.layers.Reshape((-1, n_channels))(feature_map)

    return keras.models.Model(base_model.input, flattened)


In [147]:


# ImageNet-pretrained EfficientNetB1 backbone without its classification head,
# wrapped as a frozen feature extractor.
# NOTE(review): a later cell rebinds base_model to EfficientNetB0, and the
# captioning model is built from `cnn_model`, not from `cnn`.
base_model = keras.applications.efficientnet.EfficientNetB1(
        input_shape=(*IMAGE_SIZE, 3),
        include_top=False,
        weights="imagenet",
    )

cnn = get_cnn_model(base_model)

In [148]:
# Inspect the layers and output shape of the feature extractor.
cnn.summary()

Model: "model_10"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_15 (InputLayer)       [(None, 224, 224, 3)]        0         []                            
                                                                                                  
 rescaling_20 (Rescaling)    (None, 224, 224, 3)          0         ['input_15[0][0]']            
                                                                                                  
 normalization_10 (Normaliz  (None, 224, 224, 3)          7         ['rescaling_20[0][0]']        
 ation)                                                                                           
                                                                                                  
 rescaling_21 (Rescaling)    (None, 224, 224, 3)          0         ['normalization_10[0][0

### Transformer components


In [149]:
class TransformerEncoderBlock(layers.Layer):
    """Encoder block applied to the flattened CNN feature map.

    ``call`` runs: LayerNorm -> Dense(embed_dim, relu) -> multi-head
    self-attention -> residual add -> LayerNorm.

    Args:
        embed_dim: width of the dense projection and attention ``key_dim``.
        dense_dim: stored on the layer; not used by any sub-layer here.
        num_heads: number of attention heads.
    """

    def __init__(self, embed_dim, dense_dim, num_heads, **kwargs):
        super().__init__(**kwargs)
        self.embed_dim, self.dense_dim, self.num_heads = embed_dim, dense_dim, num_heads

        self.attention_1 = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim, dropout=0.0
        )
        self.layernorm_1 = layers.LayerNormalization()
        self.layernorm_2 = layers.LayerNormalization()
        self.dense_1 = layers.Dense(embed_dim, activation="relu")

    def call(self, inputs, training, mask=None):
        """Encode ``inputs``; ``mask`` is accepted but not used."""
        # Normalise, then project the image features to embed_dim.
        projected = self.dense_1(self.layernorm_1(inputs))

        # Unmasked self-attention over all feature-map positions.
        attended = self.attention_1(
            query=projected,
            value=projected,
            key=projected,
            attention_mask=None,
            training=training,
        )

        # Residual connection followed by layer normalisation.
        return self.layernorm_2(projected + attended)

In [150]:
class PositionalEmbedding(layers.Layer):
    """Token embedding scaled by sqrt(embed_dim) plus a learned position embedding.

    Args:
        sequence_length: number of rows in the position-embedding table.
        vocab_size: number of rows in the token-embedding table.
        embed_dim: embedding width shared by both tables.
    """

    def __init__(self, sequence_length, vocab_size, embed_dim, **kwargs):
        super().__init__(**kwargs)
        self.token_embeddings = layers.Embedding(
            input_dim=vocab_size, output_dim=embed_dim
        )
        self.position_embeddings = layers.Embedding(
            input_dim=sequence_length, output_dim=embed_dim
        )
        self.sequence_length = sequence_length
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim
        self.embed_scale = tf.math.sqrt(tf.cast(embed_dim, tf.float32))

    def call(self, inputs):
        """Return scaled token embeddings plus position embeddings."""
        seq_len = tf.shape(inputs)[-1]
        position_ids = tf.range(start=0, limit=seq_len, delta=1)

        scaled_tokens = self.token_embeddings(inputs) * self.embed_scale
        return scaled_tokens + self.position_embeddings(position_ids)

    def compute_mask(self, inputs, mask=None):
        """Mark every non-zero token id as valid (id 0 is treated as padding)."""
        return tf.math.not_equal(inputs, 0)

In [316]:
class TransformerDecoderBlock(layers.Layer):
    """Decoder block: embedding -> masked self-attention -> cross-attention -> FFN -> softmax.

    Args:
        embed_dim: model width; used for the token/position embedding, the
            attention ``key_dim`` and the FFN output.
        ff_dim: hidden width of the feed-forward network.
        num_heads: number of heads in both attention layers.

    The sequence length and vocabulary size come from the module-level
    ``SEQ_LENGTH`` and ``VOCAB_SIZE`` constants.
    """

    def __init__(self, embed_dim, ff_dim, num_heads, **kwargs):
        super().__init__(**kwargs)
        self.embed_dim = embed_dim
        self.ff_dim = ff_dim
        self.num_heads = num_heads
        self.attention_1 = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim, dropout=0.1
        )
        self.attention_2 = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim, dropout=0.1
        )
        self.ffn_layer_1 = layers.Dense(ff_dim, activation="relu")
        self.ffn_layer_2 = layers.Dense(embed_dim)

        self.layernorm_1 = layers.LayerNormalization()
        self.layernorm_2 = layers.LayerNormalization()
        self.layernorm_3 = layers.LayerNormalization()

        # Use the constructor argument rather than the global EMBED_DIM, so the
        # embedding width always matches the rest of the block.
        self.embedding = PositionalEmbedding(
            embed_dim=embed_dim,
            sequence_length=SEQ_LENGTH,
            vocab_size=VOCAB_SIZE,
        )
        # Softmax output: the loss must be configured with from_logits=False.
        self.out = layers.Dense(VOCAB_SIZE, activation="softmax")

        self.dropout_1 = layers.Dropout(0.3)
        self.dropout_2 = layers.Dropout(0.5)
        self.supports_masking = True

    def call(self, inputs, encoder_outputs, training, mask=None):
        """Predict next-token distributions for every position.

        Args:
            inputs: token ids, indexed as (batch, sequence).
            encoder_outputs: encoder features used as key/value in cross-attention.
            training: forwarded to the attention and dropout layers.
            mask: optional boolean padding mask aligned with ``inputs``
                (True for real tokens).

        Returns:
            Softmax probabilities over ``VOCAB_SIZE`` for each position.
        """
        inputs = self.embedding(inputs)
        causal_mask = self.get_causal_attention_mask(inputs)

        # Self-attention is always causal. Without a padding mask the causal
        # mask is used on its own; previously it was dropped entirely, which
        # let every position attend to future tokens.
        combined_mask = causal_mask
        padding_mask = None
        if mask is not None:
            padding_mask = tf.cast(mask[:, :, tf.newaxis], dtype=tf.int32)
            combined_mask = tf.cast(mask[:, tf.newaxis, :], dtype=tf.int32)
            combined_mask = tf.minimum(combined_mask, causal_mask)

        # Masked self-attention over the caption tokens, plus residual.
        attention_output_1 = self.attention_1(
            query=inputs,
            value=inputs,
            key=inputs,
            attention_mask=combined_mask,
            training=training,
        )
        out_1 = self.layernorm_1(inputs + attention_output_1)

        # Cross-attention: caption positions query the encoder features.
        attention_output_2 = self.attention_2(
            query=out_1,
            value=encoder_outputs,
            key=encoder_outputs,
            attention_mask=padding_mask,
            training=training,
        )
        out_2 = self.layernorm_2(out_1 + attention_output_2)

        # Position-wise feed-forward network with dropout, plus residual.
        ffn_out = self.ffn_layer_1(out_2)
        ffn_out = self.dropout_1(ffn_out, training=training)
        ffn_out = self.ffn_layer_2(ffn_out)

        ffn_out = self.layernorm_3(ffn_out + out_2, training=training)
        ffn_out = self.dropout_2(ffn_out, training=training)
        preds = self.out(ffn_out)
        return preds

    def get_causal_attention_mask(self, inputs):
        """Return an int32 lower-triangular mask of shape (batch, seq, seq).

        Entry ``[b, i, j]`` is 1 when ``i >= j``, i.e. position ``i`` may
        attend to position ``j``.
        """
        input_shape = tf.shape(inputs)
        batch_size, sequence_length = input_shape[0], input_shape[1]
        i = tf.range(sequence_length)[:, tf.newaxis]
        j = tf.range(sequence_length)
        mask = tf.cast(i >= j, dtype="int32")
        mask = tf.reshape(mask, (1, input_shape[1], input_shape[1]))
        # Repeat the single (1, seq, seq) mask once per batch element.
        mult = tf.concat(
            [
                tf.expand_dims(batch_size, -1),
                tf.constant([1, 1], dtype=tf.int32),
            ],
            axis=0,
        )
        return tf.tile(mask, mult)

In [317]:
class ImageCaptioningModel(keras.Model):
    """Captioning model: CNN feature extractor -> encoder -> decoder.

    Args:
        cnn_model: maps an image batch to the features fed to the encoder.
        encoder: layer called as ``encoder(features, training=...)``.
        decoder: layer called as
            ``decoder(token_ids, encoder_out, training=..., mask=...)``.
        num_captions_per_image: number of caption slots read from axis 1 of
            the caption batch in ``train_step`` / ``test_step``.

    ``compile`` must be given a loss that returns per-token values
    (``reduction="none"``), because ``calculate_loss`` applies the padding
    mask itself.
    """

    def __init__(
        self, cnn_model, encoder, decoder, num_captions_per_image=1,
    ):
        super().__init__()
        self.cnn_model = cnn_model
        self.encoder = encoder
        self.decoder = decoder
        self.loss_tracker = keras.metrics.Mean(name="loss")
        self.acc_tracker = keras.metrics.Mean(name="accuracy")
        self.num_captions_per_image = num_captions_per_image

    def call(self, inputs):
        """Forward pass for ``inputs = [images, training_flag, token_ids]``.

        Returns the decoder's per-position vocabulary probabilities.
        """
        images, training, token_ids = inputs[0], inputs[1], inputs[2]

        x = self.cnn_model(images)
        x = self.encoder(x, False)

        # Give the decoder the same padding mask that train_step/test_step
        # use, so this path and the training path apply identical masking.
        mask = tf.math.not_equal(token_ids, 0)
        x = self.decoder(token_ids, x, training=training, mask=mask)
        return x

    def calculate_loss(self, y_true, y_pred, mask):
        """Mean of the per-token loss over the positions where ``mask`` is True."""
        loss = self.loss(y_true, y_pred)
        mask = tf.cast(mask, dtype=loss.dtype)
        loss *= mask
        return tf.reduce_sum(loss) / tf.reduce_sum(mask)

    def calculate_accuracy(self, y_true, y_pred, mask):
        """Fraction of masked-in positions whose arg-max prediction equals ``y_true``."""
        accuracy = tf.equal(y_true, tf.argmax(y_pred, axis=2))
        accuracy = tf.math.logical_and(mask, accuracy)
        accuracy = tf.cast(accuracy, dtype=tf.float32)
        mask = tf.cast(mask, dtype=tf.float32)
        return tf.reduce_sum(accuracy) / tf.reduce_sum(mask)

    def _compute_caption_loss_and_acc(self, img_embed, caption_seq, training):
        """Run encoder and decoder on one caption slot; return ``(loss, accuracy)``.

        ``caption_seq`` holds the token ids of one caption per image. The
        decoder input is the sequence without its last token and the target
        is the sequence shifted left by one.
        """
        encoder_out = self.encoder(img_embed, training=training)

        seq_inp = caption_seq[:, :-1]
        seq_true = caption_seq[:, 1:]

        # Padding mask derived from the decoder input (token id 0 = padding).
        mask = tf.math.not_equal(seq_inp, 0)

        seq_pred = self.decoder(seq_inp, encoder_out, training=training, mask=mask)

        loss = self.calculate_loss(seq_true, seq_pred, mask)
        acc = self.calculate_accuracy(seq_true, seq_pred, mask)
        return loss, acc

    def train_step(self, batch_data):
        """One optimisation step per caption slot; returns running loss/acc."""
        batch_img, batch_seq = batch_data
        batch_loss = 0
        batch_acc = 0

        # Image features are computed once, outside the tape: only the
        # encoder and decoder weights are updated.
        img_embed = self.cnn_model(batch_img)

        for i in range(self.num_captions_per_image):
            with tf.GradientTape() as tape:
                caption_loss, caption_acc = self._compute_caption_loss_and_acc(
                    img_embed, batch_seq[:, i, :], training=True
                )

            batch_loss += caption_loss
            batch_acc += caption_acc

            train_vars = (
                self.encoder.trainable_variables + self.decoder.trainable_variables
            )
            grads = tape.gradient(caption_loss, train_vars)
            self.optimizer.apply_gradients(zip(grads, train_vars))

        loss = batch_loss
        acc = batch_acc / float(self.num_captions_per_image)

        self.loss_tracker.update_state(loss)
        self.acc_tracker.update_state(acc)
        return {"loss": self.loss_tracker.result(), "acc": self.acc_tracker.result()}

    def test_step(self, batch_data):
        """Evaluation counterpart of ``train_step`` (no gradient updates)."""
        batch_img, batch_seq = batch_data
        batch_loss = 0
        batch_acc = 0

        img_embed = self.cnn_model(batch_img)

        for i in range(self.num_captions_per_image):
            caption_loss, caption_acc = self._compute_caption_loss_and_acc(
                img_embed, batch_seq[:, i, :], training=False
            )
            batch_loss += caption_loss
            batch_acc += caption_acc

        loss = batch_loss
        acc = batch_acc / float(self.num_captions_per_image)

        self.loss_tracker.update_state(loss)
        self.acc_tracker.update_state(acc)
        return {"loss": self.loss_tracker.result(), "acc": self.acc_tracker.result()}

    @property
    def metrics(self):
        # We need to list our metrics here so the `reset_states()` can be
        # called automatically.
        return [self.loss_tracker, self.acc_tracker]


In [318]:
# ImageNet-pretrained EfficientNetB0 backbone without its classification head.
base_model = keras.applications.efficientnet.EfficientNetB0(
    input_shape=(*IMAGE_SIZE, 3),
    include_top=False,
    weights="imagenet",
)

# Frozen feature extractor consumed by ImageCaptioningModel below. Without
# this assignment `cnn_model` is never defined and the following cells fail
# on a fresh Restart & Run All.
cnn_model = get_cnn_model(base_model)


In [319]:


# Assemble the captioning model: one single-head encoder block and one
# two-head decoder block on top of the CNN feature extractor.
encoder = TransformerEncoderBlock(
    embed_dim=EMBED_DIM, dense_dim=FF_DIM, num_heads=1
)
decoder = TransformerDecoderBlock(
    embed_dim=EMBED_DIM, ff_dim=FF_DIM, num_heads=2, 
)

caption_model = ImageCaptioningModel(
    cnn_model=cnn_model,
    #image_aug=None,
    encoder=encoder, 
    decoder=decoder
)



In [320]:
# Per-token loss. from_logits=False because the decoder's output layer already
# applies softmax; reduction="none" because ImageCaptioningModel.calculate_loss
# masks padding positions and averages the loss itself.
cross_entropy = keras.losses.SparseCategoricalCrossentropy(
    from_logits=False,
    reduction="none"
)

In [321]:
# Adam with learning rate 0.01 and the unreduced cross-entropy defined above.
caption_model.compile(optimizer=keras.optimizers.Adam(0.01), loss=cross_entropy)

In [322]:
# Pull a single batch from the training pipeline for a quick smoke test.
X_batch, y_batch = next(iter(train_dataset.take(1)))

# The reshape below is no longer needed: process_input already emits captions
# with the extra caption axis.
#y_batch = tf.reshape(y_batch, (64, 1 , 32))

In [323]:
# Confirm the image and caption batch shapes before fitting.
X_batch.shape, y_batch.shape

(TensorShape([64, 224, 224, 3]), TensorShape([64, 1, 32]))

Reference (Kaggle notebook, image captioning with ViT + GPT-2, "Model building" section): https://www.kaggle.com/code/burhanuddinlatsaheb/image-captioning-vit-gpt2#5.-MODEL-BUILDING

In [324]:
# Smoke test: fit for 20 epochs on the single batch pulled above, not on the
# full train_dataset.
caption_model.fit(X_batch, y_batch, epochs=20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x790f93b421d0>

In [325]:
# Re-create encoder, decoder and model from scratch. This rebinds
# caption_model, so the instance fitted above is no longer referenced by
# that name.
encoder = TransformerEncoderBlock(
    embed_dim=EMBED_DIM, dense_dim=FF_DIM, num_heads=1
)
decoder = TransformerDecoderBlock(
    embed_dim=EMBED_DIM, ff_dim=FF_DIM, num_heads=2, 
)

caption_model = ImageCaptioningModel(
    cnn_model=cnn_model,
    encoder=encoder, 
    decoder=decoder
)

# Call the model once on symbolic inputs, in the [images, training, token_ids]
# layout that ImageCaptioningModel.call expects, before summary()/save_weights().
cnn_input = tf.keras.layers.Input(shape=(IMAGE_SIZE[0], IMAGE_SIZE[1], 3))
training = False
decoder_input = tf.keras.layers.Input(shape=(None,))
caption_model([cnn_input,  training, decoder_input])

<KerasTensor: shape=(None, None, 10000) dtype=float32 (created by layer 'image_captioning_model_34')>

In [326]:
# Parameter counts per sub-model.
caption_model.summary()

Model: "image_captioning_model_34"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 model_16 (Functional)       (None, 49, 1280)          4049571   
                                                                 
 transformer_encoder_block_  multiple                  594176    
 43 (TransformerEncoderBloc                                      
 k)                                                              
                                                                 
 transformer_decoder_block_  multiple                  6454800   
 40 (TransformerDecoderBloc                                      
 k)                                                              
                                                                 
Total params: 11098551 (42.34 MB)
Trainable params: 7048976 (26.89 MB)
Non-trainable params: 4049575 (15.45 MB)
_____________________________________________________________

In [327]:
# Write the current weights to an .h5 file.
# NOTE(review): caption_model was re-created after fit(), so these are the
# weights of the re-created model, not of the fitted one.
caption_model.save_weights("captionmodel_test.h5")

In [328]:
# Round-trip check: load the file just written back into the same model.
caption_model.load_weights("captionmodel_test.h5")

In [331]:
# Build an independent model with its own encoder/decoder instances. Reusing
# `encoder` and `decoder` from caption_model would share the very same weight
# objects, so loading the checkpoint into it would prove nothing.
new_encoder = TransformerEncoderBlock(
    embed_dim=EMBED_DIM, dense_dim=FF_DIM, num_heads=1
)
new_decoder = TransformerDecoderBlock(
    embed_dim=EMBED_DIM, ff_dim=FF_DIM, num_heads=2
)

new_caption_model = ImageCaptioningModel(
    cnn_model=cnn_model,
    encoder=new_encoder,
    decoder=new_decoder,
)

# Call once on symbolic inputs ([images, training, token_ids]) so the weights
# exist before load_weights().
cnn_input = tf.keras.layers.Input(shape=(IMAGE_SIZE[0], IMAGE_SIZE[1], 3))
training = False
decoder_input = tf.keras.layers.Input(shape=(None,))
new_caption_model([cnn_input, training, decoder_input])

<KerasTensor: shape=(None, None, 10000) dtype=float32 (created by layer 'image_captioning_model_36')>

In [332]:
# Load the saved weights into the second model instance.
new_caption_model.load_weights("captionmodel_test.h5")

In [339]:
# Inspect the encoder's dense projection layer on the reloaded model.
new_caption_model.encoder.dense_1

<keras.layers.core.dense.Dense at 0x790ed4588c10>