In [None]:
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.preprocessing.image import load_img , img_to_array
from tensorflow.keras.layers import Input, Dense, LSTM , Embedding , Reshape , Concatenate , Add , MultiHeadAttention , Layer , Lambda  ,Flatten
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.applications.resnet50 import ResNet50
from tensorflow.keras.applications.resnet50 import preprocess_input as resnet_preprocess
from tensorflow.keras.models import Model
import numpy as np
import os
import tensorflow as tf

with open('/dataset flickr8k/captions.txt','r') as f:
    text = f.read()

img_to_cap = {}

text = text.split('\n')
text = text[1:-1]

for texts in text:
  texts = texts.split(',')
  img_to_cap[texts[0]] = texts[1]


captions = []
batch_size = 5

image_datagen = ImageDataGenerator(preprocessing_function = resnet_preprocess)
images = image_datagen.flow_from_directory(
    '/dataset flickr8k/Images',
    target_size = (224,224),
    batch_size = len(os.listdir('/dataset flickr8k/Images/train')),
    class_mode = None,
    shuffle = False,
)


for img_path in images.filenames:
    captions.append('ssss ' + img_to_cap[img_path.split('/')[1]] + ' eeee')

tokenizer = Tokenizer()
tokenizer.fit_on_texts(captions)
sequence = tokenizer.texts_to_sequences(captions)
padded_seq = pad_sequences(sequence,padding = 'post')
max_seq = padded_seq.shape[1]


def train_gen(image_gen , x_tokenizer , cap , batch_size):
  x_img = []
  x_train = []
  y_train = []
  while True:
    for i in range(len(cap)):
      seq = x_tokenizer.texts_to_sequences([cap[i]])[0]
      for j in range(1,len(seq)):
        x = np.array([seq[:j]])
        y = np.array([seq[j]])
        x = pad_sequences(x, maxlen = max_seq, padding = 'post')[0]
        #y = pad_sequences(y , maxlen = max_seq, padding = 'post')[0]
        x_img.append(image_gen[0][i])
        x_train.append(x)
        y_train.append(y)
        if len(x_img) == batch_size:
          yield (np.array(x_img),np.array(x_train)),np.array(y_train)
          x_img = []
          x_train = []
          y_train = []

train_data = train_gen(images , tokenizer , captions , batch_size)

total_samples = 0

for cap in captions:
    seq_len = len(tokenizer.texts_to_sequences([cap])[0])
    if seq_len > 1:
        total_samples += (seq_len - 1)

steps_per_epoch = total_samples // batch_size
print(steps_per_epoch)



Found 4 images belonging to 1 classes.
9


In [2]:
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.preprocessing.image import load_img , img_to_array
from tensorflow.keras.layers import Input, Dense, LSTM , Embedding , Reshape , Concatenate , Add , MultiHeadAttention , Layer , Lambda  ,Flatten
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.applications.resnet50 import ResNet50
from tensorflow.keras.applications.resnet50 import preprocess_input as resnet_preprocess
from tensorflow.keras.models import Model
import numpy as np
import os
import tensorflow as tf

vocab_size = len(tokenizer.word_index) + 1
max_seq = padded_seq.shape[1]
d_model = 256
num_head = 4

class PaddingMask(Layer):
  def call(self, inputs):
    padding_mask = tf.math.not_equal(inputs, 0)
    padding_mask = tf.expand_dims(padding_mask,axis = -1)
    return padding_mask


input_img = Input(shape = (224,224,3))
cnn = ResNet50(include_top = False, weights = 'imagenet', input_shape = (224,224,3))
features = cnn(input_img)
features = Reshape((features.shape[1]*features.shape[2],features.shape[3]))(features)
features = Dense(d_model,activation = 'relu')(features)


input_seq = Input(shape = (max_seq,))
embedding = Embedding(vocab_size,d_model , mask_zero = True)(input_seq)
mask = PaddingMask()(input_seq)
lstm_out , _ , _ = LSTM(d_model, return_sequences= True , return_state= True)(embedding)


attn = MultiHeadAttention(num_heads = num_head, key_dim = d_model)(query=lstm_out, value=features, key=features , attention_mask = mask)
attn = Add()([attn,lstm_out])
attn = Lambda(lambda x: x[:, -1, :])(attn)
output = Dense(vocab_size,activation = 'softmax')(attn)

model = Model(inputs = [input_img,input_seq], outputs = output)
model.compile(loss = 'sparse_categorical_crossentropy', optimizer = 'adam' , metrics = ['accuracy'])
model.summary()

print(max_seq,images[0].shape)

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/resnet/resnet50_weights_tf_dim_ordering_tf_kernels_notop.h5
[1m94765736/94765736[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step




15 (4, 224, 224, 3)


In [3]:
model.fit(train_data,steps_per_epoch=steps_per_epoch,epochs=50)

Epoch 1/50
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m91s[0m 4s/step - accuracy: 0.0044 - loss: 4.2326
Epoch 2/50
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 4s/step - accuracy: 0.0602 - loss: 3.5779
Epoch 3/50
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 4s/step - accuracy: 0.0602 - loss: 3.4900
Epoch 4/50
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 4s/step - accuracy: 0.0602 - loss: 3.3079
Epoch 5/50
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 4s/step - accuracy: 0.1583 - loss: 3.1363
Epoch 6/50
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 4s/step - accuracy: 0.2337 - loss: 2.8936
Epoch 7/50
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 4s/step - accuracy: 0.2519 - loss: 2.6550
Epoch 8/50
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 4s/step - accuracy: 0.3393 - loss: 2.2089
Epoch 9/50
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0

<keras.src.callbacks.history.History at 0x7ca57eea1100>

In [None]:
from tensorflow.keras.preprocessing.image import load_img , img_to_array
from tensorflow.keras.applications.resnet50 import preprocess_input as resnet_preprocess
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

img = load_img('/dataset flickr8k/Images/train/510510783_b2cf5d57bb.jpg',target_size = (224,224))
img = img_to_array(img)
img = np.expand_dims(img,axis = 0)
img = resnet_preprocess(img)


def generate_caption(model, image, tokenizer, max_seq_length):
  start = 'ssss'
  for i in range(max_seq_length):

    seq_input = tokenizer.texts_to_sequences([start])[0]
    seq_input = pad_sequences([seq_input], maxlen=max_seq_length, padding='post')

    predictions = model.predict(
            [image, seq_input], verbose = 0
        )
    print(np.argmax(predictions , axis = -1))

    predicted_word_idx = np.argmax(
            predictions , axis = -1
        )[0]


    if predicted_word_idx != 0:
      if tokenizer.index_word[predicted_word_idx] == 'eeee':
          break
      start += ' ' + tokenizer.index_word[predicted_word_idx]
  return start[5:]

print(generate_caption( model, img, tokenizer, max_seq))
print(img_to_cap['510510783_b2cf5d57bb.jpg'])

[14]
[14]
[16]
[17]
[18]
[6]
[19]
[20]
[3]
child child upside down from a tree swing
child hanging upside down from a tree swing
